# howard.objects.variants
import csv
import gc
import gzip
import io
import multiprocessing as mp
import os
import random
import re
import shlex
import sqlite3
import subprocess
from tempfile import NamedTemporaryFile, TemporaryDirectory
import tempfile
import duckdb
import json
import yaml
import argparse
import Bio.bgzf as bgzf
import pandas as pd
from pyfaidx import Fasta
import numpy as np
import vcf
import logging as log
import fastparquet as fp
from multiprocesspandas import applyparallel
import cyvcf2
import pyBigWig
import math

from howard.functions.commons import *
from howard.objects.database import *
from howard.functions.databases import *
from howard.functions.utils import *


class Variants:
    """
    A set of variants backed by a database connection (DuckDB or SQLite),
    built from a VCF-like input file, with configuration and parameters
    driving loading, annotation and export.
    """

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = None,
        param: dict = None,
        load: bool = False,
    ) -> None:
        """
        Initialize the object: input, config, param, output, connection,
        header and samples; optionally load the data.

        :param conn: existing database connection; when None a new one is created
        :param input: input file path (or file-like object with a ``name``)
        :param output: output file path (or file-like object with a ``name``)
        :param config: configuration dictionary (defaults to an empty dict)
        :param param: parameters dictionary (defaults to an empty dict)
        :param load: when True, load the data immediately
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        # BUGFIX: original used a mutable default argument (`config: dict = {}`),
        # shared across calls/instances; use a None sentinel instead.
        self.set_config(config if config is not None else {})

        # Param
        # BUGFIX: same mutable-default issue as `config`.
        self.set_param(param if param is not None else {})

        # Output
        self.set_output(output)

        # Connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        Set the ``samples`` attribute to the provided list, or retrieve it
        from the parameters (``samples.list``) when none is provided.

        :param samples: list of sample names, or None to read from params
        :type samples: list
        :return: the ``samples`` list that was set
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples
90 91 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 92 input and sets the `samples` attribute of the class to the provided list. If no samples are 93 provided, it tries to get the samples from the class's parameters using the `get_param` method 94 :type samples: list 95 :return: The `samples` list is being returned. 96 """ 97 98 if not samples: 99 samples = self.get_param().get("samples", {}).get("list", None) 100 101 self.samples = samples 102 103 return samples 104 105 def get_samples(self) -> list: 106 """ 107 This function returns a list of samples. 108 :return: The `get_samples` method is returning the `samples` attribute of the object. 109 """ 110 111 return self.samples 112 113 def get_samples_check(self) -> bool: 114 """ 115 This function returns the value of the "check" key within the "samples" dictionary retrieved 116 from the parameters. 117 :return: The method `get_samples_check` is returning the value of the key "check" inside the 118 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 119 method. If the key "check" is not found, it will return `False`. 120 """ 121 122 return self.get_param().get("samples", {}).get("check", True) 123 124 def set_input(self, input: str = None) -> None: 125 """ 126 The function `set_input` takes a file name as input, extracts the name and extension, and sets 127 attributes in the class accordingly. 128 129 :param input: The `set_input` method in the provided code snippet is used to set attributes 130 related to the input file. 
Here's a breakdown of the parameters and their usage in the method: 131 :type input: str 132 """ 133 134 if input and not isinstance(input, str): 135 try: 136 self.input = input.name 137 except: 138 log.error(f"Input file '{input} in bad format") 139 raise ValueError(f"Input file '{input} in bad format") 140 else: 141 self.input = input 142 143 # Input format 144 if input: 145 input_name, input_extension = os.path.splitext(self.input) 146 self.input_name = input_name 147 self.input_extension = input_extension 148 self.input_format = self.input_extension.replace(".", "") 149 150 def set_config(self, config: dict) -> None: 151 """ 152 The set_config function takes a config object and assigns it as the configuration object for the 153 class. 154 155 :param config: The `config` parameter in the `set_config` function is a dictionary object that 156 contains configuration settings for the class. When you call the `set_config` function with a 157 dictionary object as the argument, it will set that dictionary as the configuration object for 158 the class 159 :type config: dict 160 """ 161 162 self.config = config 163 164 def set_param(self, param: dict) -> None: 165 """ 166 This function sets a parameter object for the class based on the input dictionary. 
167 168 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 169 as the `param` attribute of the class instance 170 :type param: dict 171 """ 172 173 self.param = param 174 175 def init_variables(self) -> None: 176 """ 177 This function initializes the variables that will be used in the rest of the class 178 """ 179 180 self.prefix = "howard" 181 self.table_variants = "variants" 182 self.dataframe = None 183 184 self.comparison_map = { 185 "gt": ">", 186 "gte": ">=", 187 "lt": "<", 188 "lte": "<=", 189 "equals": "=", 190 "contains": "SIMILAR TO", 191 } 192 193 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 194 195 self.code_type_map_to_sql = { 196 "Integer": "INTEGER", 197 "String": "VARCHAR", 198 "Float": "FLOAT", 199 "Flag": "VARCHAR", 200 } 201 202 self.index_additionnal_fields = [] 203 204 def get_indexing(self) -> bool: 205 """ 206 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 207 returns False. 208 :return: The value of the indexing parameter. 209 """ 210 211 return self.get_param().get("indexing", False) 212 213 def get_connexion_config(self) -> dict: 214 """ 215 The function `get_connexion_config` returns a dictionary containing the configuration for a 216 connection, including the number of threads and memory limit. 217 :return: a dictionary containing the configuration for the Connexion library. 
218 """ 219 220 # config 221 config = self.get_config() 222 223 # Connexion config 224 connexion_config = {} 225 threads = self.get_threads() 226 227 # Threads 228 if threads: 229 connexion_config["threads"] = threads 230 231 # Memory 232 # if config.get("memory", None): 233 # connexion_config["memory_limit"] = config.get("memory") 234 if self.get_memory(): 235 connexion_config["memory_limit"] = self.get_memory() 236 237 # Temporary directory 238 if config.get("tmp", None): 239 connexion_config["temp_directory"] = config.get("tmp") 240 241 # Access 242 if config.get("access", None): 243 access = config.get("access") 244 if access in ["RO"]: 245 access = "READ_ONLY" 246 elif access in ["RW"]: 247 access = "READ_WRITE" 248 connexion_db = self.get_connexion_db() 249 if connexion_db in ":memory:": 250 access = "READ_WRITE" 251 connexion_config["access_mode"] = access 252 253 return connexion_config 254 255 def get_duckdb_settings(self) -> dict: 256 """ 257 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 258 string. 259 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 260 """ 261 262 # config 263 config = self.get_config() 264 265 # duckdb settings 266 duckdb_settings_dict = {} 267 if config.get("duckdb_settings", None): 268 duckdb_settings = config.get("duckdb_settings") 269 duckdb_settings = full_path(duckdb_settings) 270 # duckdb setting is a file 271 if os.path.exists(duckdb_settings): 272 with open(duckdb_settings) as json_file: 273 duckdb_settings_dict = yaml.safe_load(json_file) 274 # duckdb settings is a string 275 else: 276 duckdb_settings_dict = json.loads(duckdb_settings) 277 278 return duckdb_settings_dict 279 280 def set_connexion_db(self) -> str: 281 """ 282 The function `set_connexion_db` returns the appropriate database connection string based on the 283 input format and connection type. 284 :return: the value of the variable `connexion_db`. 
285 """ 286 287 # Default connexion db 288 default_connexion_db = ":memory:" 289 290 # Find connexion db 291 if self.get_input_format() in ["db", "duckdb"]: 292 connexion_db = self.get_input() 293 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 294 connexion_db = default_connexion_db 295 elif self.get_connexion_type() in ["tmpfile"]: 296 tmp_name = tempfile.mkdtemp( 297 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 298 ) 299 connexion_db = f"{tmp_name}/tmp.db" 300 elif self.get_connexion_type() != "": 301 connexion_db = self.get_connexion_type() 302 else: 303 connexion_db = default_connexion_db 304 305 # Set connexion db 306 self.connexion_db = connexion_db 307 308 return connexion_db 309 310 def set_connexion(self, conn) -> None: 311 """ 312 The function `set_connexion` creates a connection to a database, with options for different 313 database formats and settings. 314 315 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 316 database. If a connection is not provided, a new connection to an in-memory database is created. 
317 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 318 sqlite 319 """ 320 321 # Connexion db 322 connexion_db = self.set_connexion_db() 323 324 # Connexion config 325 connexion_config = self.get_connexion_config() 326 327 # Connexion format 328 connexion_format = self.get_config().get("connexion_format", "duckdb") 329 # Set connexion format 330 self.connexion_format = connexion_format 331 332 # Connexion 333 if not conn: 334 if connexion_format in ["duckdb"]: 335 conn = duckdb.connect(connexion_db, config=connexion_config) 336 # duckDB settings 337 duckdb_settings = self.get_duckdb_settings() 338 if duckdb_settings: 339 for setting in duckdb_settings: 340 setting_value = duckdb_settings.get(setting) 341 if isinstance(setting_value, str): 342 setting_value = f"'{setting_value}'" 343 conn.execute(f"PRAGMA {setting}={setting_value};") 344 elif connexion_format in ["sqlite"]: 345 conn = sqlite3.connect(connexion_db) 346 347 # Set connexion 348 self.conn = conn 349 350 # Log 351 log.debug(f"connexion_format: {connexion_format}") 352 log.debug(f"connexion_db: {connexion_db}") 353 log.debug(f"connexion config: {connexion_config}") 354 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 355 356 def set_output(self, output: str = None) -> None: 357 """ 358 The `set_output` function in Python sets the output file based on the input or a specified key 359 in the config file, extracting the output name, extension, and format. 360 361 :param output: The `output` parameter in the `set_output` method is used to specify the name of 362 the output file. If the config file has an 'output' key, the method sets the output to the value 363 of that key. 
If no output is provided, it sets the output to `None` 364 :type output: str 365 """ 366 367 if output and not isinstance(output, str): 368 self.output = output.name 369 else: 370 self.output = output 371 372 # Output format 373 if self.output: 374 output_name, output_extension = os.path.splitext(self.output) 375 self.output_name = output_name 376 self.output_extension = output_extension 377 self.output_format = self.output_extension.replace(".", "") 378 else: 379 self.output_name = None 380 self.output_extension = None 381 self.output_format = None 382 383 def set_header(self) -> None: 384 """ 385 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 386 """ 387 388 input_file = self.get_input() 389 default_header_list = [ 390 "##fileformat=VCFv4.2", 391 "#CHROM POS ID REF ALT QUAL FILTER INFO", 392 ] 393 394 # Full path 395 input_file = full_path(input_file) 396 397 if input_file: 398 399 input_format = self.get_input_format() 400 input_compressed = self.get_input_compressed() 401 config = self.get_config() 402 header_list = default_header_list 403 if input_format in [ 404 "vcf", 405 "hdr", 406 "tsv", 407 "csv", 408 "psv", 409 "parquet", 410 "db", 411 "duckdb", 412 ]: 413 # header provided in param 414 if config.get("header_file", None): 415 with open(config.get("header_file"), "rt") as f: 416 header_list = self.read_vcf_header(f) 417 # within a vcf file format (header within input file itsself) 418 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 419 # within a compressed vcf file format (.vcf.gz) 420 if input_compressed: 421 with bgzf.open(input_file, "rt") as f: 422 header_list = self.read_vcf_header(f) 423 # within an uncompressed vcf file format (.vcf) 424 else: 425 with open(input_file, "rt") as f: 426 header_list = self.read_vcf_header(f) 427 # header provided in default external file .hdr 428 elif os.path.exists((input_file + ".hdr")): 429 with open(input_file + ".hdr", "rt") as f: 430 header_list = 
self.read_vcf_header(f) 431 else: 432 try: # Try to get header info fields and file columns 433 434 with tempfile.TemporaryDirectory() as tmpdir: 435 436 # Create database 437 db_for_header = Database(database=input_file) 438 439 # Get header columns for infos fields 440 db_header_from_columns = ( 441 db_for_header.get_header_from_columns() 442 ) 443 444 # Get real columns in the file 445 db_header_columns = db_for_header.get_columns() 446 447 # Write header file 448 header_file_tmp = os.path.join(tmpdir, "header") 449 f = open(header_file_tmp, "w") 450 vcf.Writer(f, db_header_from_columns) 451 f.close() 452 453 # Replace #CHROM line with rel columns 454 header_list = db_for_header.read_header_file( 455 header_file=header_file_tmp 456 ) 457 header_list[-1] = "\t".join(db_header_columns) 458 459 except: 460 461 log.warning( 462 f"No header for file {input_file}. Set as default VCF header" 463 ) 464 header_list = default_header_list 465 466 else: # try for unknown format ? 467 468 log.error(f"Input file format '{input_format}' not available") 469 raise ValueError(f"Input file format '{input_format}' not available") 470 471 if not header_list: 472 header_list = default_header_list 473 474 # header as list 475 self.header_list = header_list 476 477 # header as VCF object 478 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 479 480 else: 481 482 self.header_list = None 483 self.header_vcf = None 484 485 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 486 """ 487 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 488 DataFrame based on the connection format. 489 490 :param query: The `query` parameter in the `get_query_to_df` function is a string that 491 represents the SQL query you want to execute. 
This query will be used to fetch data from a 492 database and convert it into a pandas DataFrame 493 :type query: str 494 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 495 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 496 function will only fetch up to that number of rows from the database query result. If no limit 497 is specified, 498 :type limit: int 499 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 500 """ 501 502 # Connexion format 503 connexion_format = self.get_connexion_format() 504 505 # Limit in query 506 if limit: 507 pd.set_option("display.max_rows", limit) 508 if connexion_format in ["duckdb"]: 509 df = ( 510 self.conn.execute(query) 511 .fetch_record_batch(limit) 512 .read_next_batch() 513 .to_pandas() 514 ) 515 elif connexion_format in ["sqlite"]: 516 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 517 518 # Full query 519 else: 520 if connexion_format in ["duckdb"]: 521 df = self.conn.execute(query).df() 522 elif connexion_format in ["sqlite"]: 523 df = pd.read_sql_query(query, self.conn) 524 525 return df 526 527 def get_overview(self) -> None: 528 """ 529 The function prints the input, output, config, and dataframe of the current object 530 """ 531 table_variants_from = self.get_table_variants(clause="from") 532 sql_columns = self.get_header_columns_as_sql() 533 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 534 df = self.get_query_to_df(sql_query_export) 535 log.info( 536 "Input: " 537 + str(self.get_input()) 538 + " [" 539 + str(str(self.get_input_format())) 540 + "]" 541 ) 542 log.info( 543 "Output: " 544 + str(self.get_output()) 545 + " [" 546 + str(str(self.get_output_format())) 547 + "]" 548 ) 549 log.info("Config: ") 550 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 551 "\n" 552 ): 553 log.info("\t" + str(d)) 554 log.info("Param: ") 555 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 556 "\n" 557 ): 558 log.info("\t" + str(d)) 559 log.info("Sample list: " + str(self.get_header_sample_list())) 560 log.info("Dataframe: ") 561 for d in str(df).split("\n"): 562 log.info("\t" + str(d)) 563 564 # garbage collector 565 del df 566 gc.collect() 567 568 return None 569 570 def get_stats(self) -> dict: 571 """ 572 The `get_stats` function calculates and returns various statistics of the current object, 573 including information about the input file, variants, samples, header fields, quality, and 574 SNVs/InDels. 575 :return: a dictionary containing various statistics of the current object. The dictionary has 576 the following structure: 577 """ 578 579 # Log 580 log.info(f"Stats Calculation...") 581 582 # table varaints 583 table_variants_from = self.get_table_variants() 584 585 # stats dict 586 stats = {"Infos": {}} 587 588 ### File 589 input_file = self.get_input() 590 stats["Infos"]["Input file"] = input_file 591 592 # Header 593 header_infos = self.get_header().infos 594 header_formats = self.get_header().formats 595 header_infos_list = list(header_infos) 596 header_formats_list = list(header_formats) 597 598 ### Variants 599 600 stats["Variants"] = {} 601 602 # Variants by chr 603 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 604 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 605 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 606 by=["CHROM"], kind="quicksort" 607 ) 608 609 # Total number of variants 610 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 611 612 # Calculate percentage 613 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 614 lambda x: (x / nb_of_variants) 615 ) 616 617 stats["Variants"]["Number of variants by chromosome"] = ( 618 nb_of_variants_by_chrom.to_dict(orient="index") 619 ) 620 621 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 622 623 ### Samples 624 625 # Init 626 samples = {} 627 nb_of_samples = 0 628 629 # Check Samples 630 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 631 log.debug(f"Check samples...") 632 for sample in self.get_header_sample_list(): 633 sql_query_samples = f""" 634 SELECT '{sample}' as sample, 635 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 636 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 637 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 638 FROM {table_variants_from} 639 WHERE ( 640 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 641 AND 642 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 643 ) 644 GROUP BY genotype 645 """ 646 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 647 sample_genotype_count = sql_query_genotype_df["count"].sum() 648 if len(sql_query_genotype_df): 649 nb_of_samples += 1 650 samples[f"{sample} - {sample_genotype_count} variants"] = ( 651 sql_query_genotype_df.to_dict(orient="index") 652 ) 653 654 stats["Samples"] = samples 655 stats["Infos"]["Number of samples"] = nb_of_samples 656 657 # # 658 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 659 # stats["Infos"]["Number of samples"] = nb_of_samples 660 # elif nb_of_samples: 661 # stats["Infos"]["Number of samples"] = "not a VCF format" 662 663 ### INFO and FORMAT fields 664 header_types_df = {} 665 header_types_list = { 666 "List of INFO fields": header_infos, 667 "List of FORMAT fields": header_formats, 668 } 669 i = 0 670 for header_type in header_types_list: 671 672 header_type_infos = header_types_list.get(header_type) 673 header_infos_dict = {} 674 675 for info in header_type_infos: 676 677 i += 1 678 header_infos_dict[i] = {} 679 680 # ID 681 header_infos_dict[i]["id"] = info 682 683 # num 684 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 685 if header_type_infos[info].num in genotype_map.keys(): 686 header_infos_dict[i]["Number"] = genotype_map.get( 687 header_type_infos[info].num 688 ) 689 else: 690 header_infos_dict[i]["Number"] = header_type_infos[info].num 691 692 # type 693 if header_type_infos[info].type: 694 header_infos_dict[i]["Type"] = header_type_infos[info].type 695 else: 696 header_infos_dict[i]["Type"] = "." 697 698 # desc 699 if header_type_infos[info].desc != None: 700 header_infos_dict[i]["Description"] = header_type_infos[info].desc 701 else: 702 header_infos_dict[i]["Description"] = "" 703 704 if len(header_infos_dict): 705 header_types_df[header_type] = pd.DataFrame.from_dict( 706 header_infos_dict, orient="index" 707 ).to_dict(orient="index") 708 709 # Stats 710 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 711 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 712 stats["Header"] = header_types_df 713 714 ### QUAL 715 if "QUAL" in self.get_header_columns(): 716 sql_query_qual = f""" 717 SELECT 718 avg(CAST(QUAL AS INTEGER)) AS Average, 719 min(CAST(QUAL AS INTEGER)) AS Minimum, 720 max(CAST(QUAL AS INTEGER)) AS Maximum, 721 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 722 median(CAST(QUAL AS INTEGER)) AS Median, 723 variance(CAST(QUAL AS INTEGER)) AS Variance 724 FROM {table_variants_from} 725 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 726 """ 727 728 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 729 stats["Quality"] = {"Stats": qual} 730 731 ### SNV and InDel 732 733 sql_query_snv = f""" 734 735 SELECT Type, count FROM ( 736 737 SELECT 738 'Total' AS Type, 739 count(*) AS count 740 FROM {table_variants_from} 741 742 UNION 743 744 SELECT 745 'MNV' AS Type, 746 count(*) AS count 747 FROM {table_variants_from} 748 WHERE len(REF) > 1 AND len(ALT) > 1 749 AND len(REF) = len(ALT) 750 751 UNION 752 753 SELECT 754 'InDel' AS Type, 755 count(*) AS count 756 FROM 
{table_variants_from} 757 WHERE len(REF) > 1 OR len(ALT) > 1 758 AND len(REF) != len(ALT) 759 760 UNION 761 762 SELECT 763 'SNV' AS Type, 764 count(*) AS count 765 FROM {table_variants_from} 766 WHERE len(REF) = 1 AND len(ALT) = 1 767 768 ) 769 770 ORDER BY count DESC 771 772 """ 773 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 774 775 sql_query_snv_substitution = f""" 776 SELECT 777 concat(REF, '>', ALT) AS 'Substitution', 778 count(*) AS count 779 FROM {table_variants_from} 780 WHERE len(REF) = 1 AND len(ALT) = 1 781 GROUP BY REF, ALT 782 ORDER BY count(*) DESC 783 """ 784 snv_substitution = ( 785 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 786 ) 787 stats["Variants"]["Counts"] = snv_indel 788 stats["Variants"]["Substitutions"] = snv_substitution 789 790 return stats 791 792 def stats_to_file(self, file: str = None) -> str: 793 """ 794 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 795 into a JSON object, and writes the JSON object to the specified file. 796 797 :param file: The `file` parameter is a string that represents the file path where the JSON data 798 will be written 799 :type file: str 800 :return: the name of the file that was written to. 801 """ 802 803 # Get stats 804 stats = self.get_stats() 805 806 # Serializing json 807 json_object = json.dumps(stats, indent=4) 808 809 # Writing to sample.json 810 with open(file, "w") as outfile: 811 outfile.write(json_object) 812 813 return file 814 815 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 816 """ 817 The `print_stats` function generates a markdown file and prints the statistics contained in a 818 JSON file in a formatted manner. 819 820 :param output_file: The `output_file` parameter is a string that specifies the path and filename 821 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 822 provided, a temporary directory will be created and the stats will be saved in a file named 823 "stats.md" within that 824 :type output_file: str 825 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 826 file where the statistics will be saved. If no value is provided, a temporary directory will be 827 created and a default file name "stats.json" will be used 828 :type json_file: str 829 :return: The function `print_stats` does not return any value. It has a return type annotation 830 of `None`. 831 """ 832 833 # Full path 834 output_file = full_path(output_file) 835 json_file = full_path(json_file) 836 837 with tempfile.TemporaryDirectory() as tmpdir: 838 839 # Files 840 if not output_file: 841 output_file = os.path.join(tmpdir, "stats.md") 842 if not json_file: 843 json_file = os.path.join(tmpdir, "stats.json") 844 845 # Create folders 846 if not os.path.exists(os.path.dirname(output_file)): 847 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 848 if not os.path.exists(os.path.dirname(json_file)): 849 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 850 851 # Create stats JSON file 852 stats_file = self.stats_to_file(file=json_file) 853 854 # Print stats file 855 with open(stats_file) as f: 856 stats = yaml.safe_load(f) 857 858 # Output 859 output_title = [] 860 output_index = [] 861 output = [] 862 863 # Title 864 output_title.append("# HOWARD Stats") 865 866 # Index 867 output_index.append("## Index") 868 869 # Process sections 870 for section in stats: 871 infos = stats.get(section) 872 section_link = "#" + section.lower().replace(" ", "-") 873 output.append(f"## {section}") 874 output_index.append(f"- [{section}]({section_link})") 875 876 if len(infos): 877 for info in infos: 878 try: 879 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 880 is_df = True 881 except: 882 try: 883 df = pd.DataFrame.from_dict( 884 
json.loads((infos.get(info))), orient="index" 885 ) 886 is_df = True 887 except: 888 is_df = False 889 if is_df: 890 output.append(f"### {info}") 891 info_link = "#" + info.lower().replace(" ", "-") 892 output_index.append(f" - [{info}]({info_link})") 893 output.append(f"{df.to_markdown(index=False)}") 894 else: 895 output.append(f"- {info}: {infos.get(info)}") 896 else: 897 output.append(f"NA") 898 899 # Write stats in markdown file 900 with open(output_file, "w") as fp: 901 for item in output_title: 902 fp.write("%s\n" % item) 903 for item in output_index: 904 fp.write("%s\n" % item) 905 for item in output: 906 fp.write("%s\n" % item) 907 908 # Output stats in markdown 909 print("") 910 print("\n\n".join(output_title)) 911 print("") 912 print("\n\n".join(output)) 913 print("") 914 915 return None 916 917 def get_input(self) -> str: 918 """ 919 It returns the value of the input variable. 920 :return: The input is being returned. 921 """ 922 return self.input 923 924 def get_input_format(self, input_file: str = None) -> str: 925 """ 926 This function returns the format of the input variable, either from the provided input file or 927 by prompting for input. 928 929 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 930 represents the file path of the input file. If no `input_file` is provided when calling the 931 method, it will default to `None` 932 :type input_file: str 933 :return: The format of the input variable is being returned. 934 """ 935 936 if not input_file: 937 input_file = self.get_input() 938 input_format = get_file_format(input_file) 939 return input_format 940 941 def get_input_compressed(self, input_file: str = None) -> str: 942 """ 943 The function `get_input_compressed` returns the format of the input variable after compressing 944 it. 945 946 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 947 that represents the file path of the input file. 
If no `input_file` is provided when calling the 948 method, it will default to `None` and the method will then call `self.get_input()` to 949 :type input_file: str 950 :return: The function `get_input_compressed` returns the compressed format of the input 951 variable. 952 """ 953 954 if not input_file: 955 input_file = self.get_input() 956 input_compressed = get_file_compressed(input_file) 957 return input_compressed 958 959 def get_output(self) -> str: 960 """ 961 It returns the output of the neuron. 962 :return: The output of the neural network. 963 """ 964 965 return self.output 966 967 def get_output_format(self, output_file: str = None) -> str: 968 """ 969 The function `get_output_format` returns the format of the input variable or the output file if 970 provided. 971 972 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 973 that represents the file path of the output file. If no `output_file` is provided when calling 974 the method, it will default to the output obtained from the `get_output` method of the class 975 instance. The 976 :type output_file: str 977 :return: The format of the input variable is being returned. 978 """ 979 980 if not output_file: 981 output_file = self.get_output() 982 output_format = get_file_format(output_file) 983 984 return output_format 985 986 def get_config(self) -> dict: 987 """ 988 It returns the config 989 :return: The config variable is being returned. 990 """ 991 return self.config 992 993 def get_param(self) -> dict: 994 """ 995 It returns the param 996 :return: The param variable is being returned. 997 """ 998 return self.param 999 1000 def get_connexion_db(self) -> str: 1001 """ 1002 It returns the connexion_db attribute of the object 1003 :return: The connexion_db is being returned. 1004 """ 1005 return self.connexion_db 1006 1007 def get_prefix(self) -> str: 1008 """ 1009 It returns the prefix of the object. 1010 :return: The prefix is being returned. 
1011 """ 1012 return self.prefix 1013 1014 def get_table_variants(self, clause: str = "select") -> str: 1015 """ 1016 This function returns the table_variants attribute of the object 1017 1018 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1019 defaults to select (optional) 1020 :return: The table_variants attribute of the object. 1021 """ 1022 1023 # Access 1024 access = self.get_config().get("access", None) 1025 1026 # Clauses "select", "where", "update" 1027 if clause in ["select", "where", "update"]: 1028 table_variants = self.table_variants 1029 # Clause "from" 1030 elif clause in ["from"]: 1031 # For Read Only 1032 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1033 input_file = self.get_input() 1034 table_variants = f"'{input_file}' as variants" 1035 # For Read Write 1036 else: 1037 table_variants = f"{self.table_variants} as variants" 1038 else: 1039 table_variants = self.table_variants 1040 return table_variants 1041 1042 def get_tmp_dir(self) -> str: 1043 """ 1044 The function `get_tmp_dir` returns the temporary directory path based on configuration 1045 parameters or a default path. 1046 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1047 configuration, parameters, and a default value of "/tmp". 1048 """ 1049 1050 return get_tmp( 1051 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1052 ) 1053 1054 def get_connexion_type(self) -> str: 1055 """ 1056 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1057 1058 :return: The connexion type is being returned. 1059 """ 1060 return self.get_config().get("connexion_type", "memory") 1061 1062 def get_connexion(self): 1063 """ 1064 It returns the connection object 1065 1066 :return: The connection object. 1067 """ 1068 return self.conn 1069 1070 def close_connexion(self) -> None: 1071 """ 1072 This function closes the connection to the database. 
        :return: The connection is being closed.
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        This function returns the header of the VCF file, either as a vcf.Reader object or as a list
        of raw header lines.

        :param type: the type of header you want to get, either "vcf" (vcf.Reader object) or "list"
        (raw header lines), defaults to vcf (optional)
        :return: The header of the vcf file, or a minimal required header if no header is loaded.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            # No header loaded: fall back to the minimal required VCF header
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_infos_list(self) -> list:
        """
        This function retrieves the list of INFO field IDs from the header.
        :return: A list of INFO field IDs from the header.
        """

        # Init
        infos_list = []

        for field in self.get_header().infos:
            infos_list.append(field)

        return infos_list

    def get_header_length(self, file: str = None) -> int:
        """
        The function `get_header_length` returns the length of the header list, excluding the #CHROM
        line.

        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
        header file. If this argument is provided, the function will read the header from the specified
        file and return the length of the header list minus 1 (to exclude the #CHROM line)
        :type file: str
        :return: the length of the header list, excluding the #CHROM line (0 if no header).
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        This function returns the columns line ("#CHROM" line) of the VCF header.

        :return: The last line of the header (the "#CHROM" columns line), or "" if no header.
1135 """ 1136 if self.get_header(): 1137 return self.get_header(type="list")[-1] 1138 else: 1139 return "" 1140 1141 def get_header_columns_as_list(self) -> list: 1142 """ 1143 This function returns the header list of a VCF 1144 1145 :return: The length of the header list. 1146 """ 1147 if self.get_header(): 1148 return self.get_header_columns().strip().split("\t") 1149 else: 1150 return [] 1151 1152 def get_header_columns_as_sql(self) -> str: 1153 """ 1154 This function retruns header length (without #CHROM line) 1155 1156 :return: The length of the header list. 1157 """ 1158 sql_column_list = [] 1159 for col in self.get_header_columns_as_list(): 1160 sql_column_list.append(f'"{col}"') 1161 return ",".join(sql_column_list) 1162 1163 def get_header_sample_list( 1164 self, check: bool = False, samples: list = None, samples_force: bool = False 1165 ) -> list: 1166 """ 1167 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1168 checking and filtering based on input parameters. 1169 1170 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1171 parameter that determines whether to check if the samples in the list are properly defined as 1172 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1173 list is defined as a, defaults to False 1174 :type check: bool (optional) 1175 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1176 allows you to specify a subset of samples from the header. If you provide a list of sample 1177 names, the function will check if each sample is defined in the header. 
If a sample is not found
        in the header, it is discarded with a warning
        :type samples: list
        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
        a boolean parameter that determines whether to force the function to return the sample list
        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
        function will return the sample list without performing, defaults to False
        :type samples_force: bool (optional)
        :return: The function `get_header_sample_list` returns a list of samples based on the input
        parameters and conditions specified in the function.
        """

        # Init
        samples_list = []

        # Either all header samples, or the requested subset restricted to header samples
        if samples is None:
            samples_list = self.header_vcf.samples
        else:
            samples_checked = []
            for sample in samples:
                if sample in self.header_vcf.samples:
                    samples_checked.append(sample)
                else:
                    log.warning(f"Sample '{sample}' not defined in header")
            samples_list = samples_checked

        # Force sample list without checking if is_genotype_column
        if samples_force:
            log.warning(f"Samples {samples_list} not checked if genotypes")
            return samples_list

        # Optionally keep only samples whose column holds well-formed genotypes
        if check:
            samples_checked = []
            for sample in samples_list:
                if self.is_genotype_column(column=sample):
                    samples_checked.append(sample)
                else:
                    log.warning(
                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
                    )
            samples_list = samples_checked

        # Return samples list
        return samples_list

    def is_genotype_column(self, column: str = None) -> bool:
        """
        This function checks if a given column is a genotype column in a database.

        :param column: The `column` parameter in the `is_genotype_column` method is a string that
        represents the column name in a database table. This method checks if the specified column is a
        genotype column in the database.
If a column name is provided, it calls the `is_genotype_column`
        method of the `Database` class
        :type column: str
        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
        column name and returns the result. If the `column` parameter is None, it returns False.
        """

        if column is not None:
            return Database(database=self.get_input()).is_genotype_column(column=column)
        else:
            return False

    def get_verbose(self) -> bool:
        """
        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
        exist

        :return: The value of the key "verbose" in the config dictionary.
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        It returns the connexion format of the object.
        :raises ValueError: if the connexion format is not "duckdb" or "sqlite"
        :return: The connexion_format is being returned.
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table based on the specified
        database format.

        :param file: The `file` parameter is the file that you want to load into a table. It should be
        the path to the file on your system (or an open file object accepted by `pandas.read_csv`)
        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
        should contain the names of the columns in the table where the data will be inserted. The column
        names should be separated by commas within the string
        :type columns: str
        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
        the number of lines to skip at the beginning of the file before reading the actual data. This
        parameter allows you to skip any header information present in the file before processing the
        data, defaults to 0
        :type header_len: int (optional)
        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
        separator character that is used in the file being read. In this case, the default separator is
        set to `\t`, which represents a tab character, defaults to \t
        :type sep: str (optional)
        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
        when processing the file in chunks, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config may override the chunk size
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves the local DataFrame `chunk` by name (replacement scan)
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
        table before loading the data and specify a sample size.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file.
If it is set to `None`, the default value of 20480 will be used, defaults to
        20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size (falsy values mean "no sampling limit")
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    # NOTE(review): unreachable — guarded by the outer duckdb check above
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except masks the real failure; consider `except Exception as e`
                # and `raise ... from e`
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): this is an alias, not a copy — the updates below also mutate `structure`
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): when compressed, the plain handle above stays open unused and the
                # bgzf handle is not closed by this `with` — confirm intended
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()

    def get_explode_infos(self) -> bool:
        """
        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
        to False if it is not set.
        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
        value. If the parameter is not present, it will return False.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)

    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns a list of exploded information fields based on
        the input parameter `explode_infos_fields`.

        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
        fields to be exploded. It can be a comma-separated list of field names (or regex patterns); the
        keyword "*" expands to all fields in the header
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
        flag that determines whether to remove fields that are not present in the header. If it is set
        to `True`, any field that is not in the header will be excluded from the list of exploded
        information fields, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
        If no fields can be resolved, it returns an empty list. Patterns (including the "*" keyword)
        are expanded against the INFO fields declared in the header.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern
                r = re.compile(rf"^{field}$")
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix`
        parameter, or the value configured in the params if `explode_infos_prefix` is not provided.

        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
        prefix to be used for exploding or expanding information
        :type explode_infos_prefix: str
        :return: the value of the variable `explode_infos_prefix`.
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The parameter "column_name" is the name of the column that you want to add
        to the table
        :param column_type: The `column_type` parameter specifies the data type of the column that you
        want to add to the table. It should be a string that represents the desired data type, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: The `default_value` parameter is an optional parameter that specifies the
        default value for the newly added column.
If a default value is provided, it will be assigned to
        the column for any existing rows that do not have a value for that column
        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
        if it already exists in the table. If `drop` is set to `True`, the function will drop the
        existing column before adding the new column, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column ("table_name", "column_name", "column_type",
        "default_value"), or None when no column was added (already exists without `drop`, or was
        dropped and re-created).
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table (case-insensitive)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # NOTE(review): after drop+re-add, `added` is False so None is returned even though the
        # column was (re)created — confirm intended
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

    def drop_column(
        self, column: dict = None, table_name: str = None, column_name: str = None
    ) -> bool:
        """
        The `drop_column` function drops a specified column from a given table in a database and returns
        True if the column was successfully dropped, and False if the column does not exist in the
        table.

        :param column: The `column` parameter is a dictionary that contains information about the column
        you want to drop ("table_name" and "column_name" keys); a plain string is also accepted and is
        treated as a column of the variants table
        :type column: dict
        :param table_name: The `table_name` parameter is the name of the table from which you want to
        drop a column
        :type table_name: str
        :param column_name: The `column_name` parameter is the name of the column that you want to drop
        from the table
        :type column_name: str
        :return: a boolean value. It returns True if the column was successfully dropped from the table,
        and False if the column does not exist in the table.
        """

        # Find column infos
        if column:
            if isinstance(column, dict):
                table_name = column.get("table_name", None)
                column_name = column.get("column_name", None)
            elif isinstance(column, str):
                table_name = self.get_table_variants()
                column_name = column
            else:
                table_name = None
                column_name = None

        # NOTE(review): `and` lets a call with only one of table_name/column_name fall through to
        # the query below — confirm whether `or` was intended
        if not table_name and not column_name:
            return False

        # Removed
        removed = False

        # Check if the column already exists in the table (case-sensitive, unlike add_column)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(f"The {column_name} column exists in the {table_name} table")
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
            return False

        # Drop column from table (e.g. ALTER TABLE integers DROP k)
        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
        self.execute_query(add_column_query)
        removed = True
        log.debug(
            f"The {column_name} column was successfully dropped to the {table_name} table"
        )

1769 return removed 1770 1771 def explode_infos( 1772 self, 1773 prefix: str = None, 1774 create_index: bool = False, 1775 fields: list = None, 1776 force: bool = False, 1777 proccess_all_fields_together: bool = False, 1778 table: str = None, 1779 ) -> list: 1780 """ 1781 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1782 individual columns, returning a list of added columns. 1783 1784 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1785 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1786 `self.get_explode_infos_prefix()` as the prefix 1787 :type prefix: str 1788 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1789 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1790 `False`, indexes will not be created. The default value is `False`, defaults to False 1791 :type create_index: bool (optional) 1792 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1793 that you want to explode into individual columns. If this parameter is not provided, all INFO 1794 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1795 a list to the ` 1796 :type fields: list 1797 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1798 determines whether to drop and recreate a column if it already exists in the table. If `force` 1799 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1800 defaults to False 1801 :type force: bool (optional) 1802 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1803 flag that determines whether to process all the INFO fields together or individually. If set to 1804 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1805 be processed individually. The default value is, defaults to False 1806 :type proccess_all_fields_together: bool (optional) 1807 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1808 of the table where the exploded INFO fields will be added as individual columns. If you provide 1809 a value for the `table` parameter, the function will use that table name. If the `table` 1810 parameter is 1811 :type table: str 1812 :return: The `explode_infos` function returns a list of added columns. 1813 """ 1814 1815 # drop indexes 1816 self.drop_indexes() 1817 1818 # connexion format 1819 connexion_format = self.get_connexion_format() 1820 1821 # Access 1822 access = self.get_config().get("access", None) 1823 1824 # Added columns 1825 added_columns = [] 1826 1827 if access not in ["RO"]: 1828 1829 # prefix 1830 if prefix in [None, True] or not isinstance(prefix, str): 1831 if self.get_explode_infos_prefix() not in [None, True]: 1832 prefix = self.get_explode_infos_prefix() 1833 else: 1834 prefix = "INFO/" 1835 1836 # table variants 1837 if table is not None: 1838 table_variants = table 1839 else: 1840 table_variants = self.get_table_variants(clause="select") 1841 1842 # extra infos 1843 try: 1844 extra_infos = self.get_extra_infos() 1845 except: 1846 extra_infos = [] 1847 1848 # Header infos 1849 header_infos = self.get_header().infos 1850 1851 log.debug( 1852 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1853 ) 1854 1855 sql_info_alter_table_array = [] 1856 1857 # Info fields to check 1858 fields_list = list(header_infos) 1859 if fields: 1860 fields_list += fields 1861 fields_list = set(fields_list) 1862 1863 # If no fields 1864 if not fields: 1865 fields = [] 1866 1867 # Translate fields if patterns 1868 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1869 1870 for info in fields: 1871 1872 info_id_sql = prefix + info 1873 1874 if ( 1875 info 
in fields_list 1876 or prefix + info in fields_list 1877 or info in extra_infos 1878 ): 1879 1880 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1881 1882 if info in header_infos: 1883 info_type = header_infos[info].type 1884 info_num = header_infos[info].num 1885 else: 1886 info_type = "String" 1887 info_num = 0 1888 1889 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1890 if info_num != 1: 1891 type_sql = "VARCHAR" 1892 1893 # Add field 1894 added_column = self.add_column( 1895 table_name=table_variants, 1896 column_name=info_id_sql, 1897 column_type=type_sql, 1898 default_value="null", 1899 drop=force, 1900 ) 1901 1902 if added_column: 1903 added_columns.append(added_column) 1904 1905 if added_column or force: 1906 1907 # add field to index 1908 self.index_additionnal_fields.append(info_id_sql) 1909 1910 # Update field array 1911 if connexion_format in ["duckdb"]: 1912 update_info_field = f""" 1913 "{info_id_sql}" = 1914 CASE 1915 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1916 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1917 END 1918 """ 1919 elif connexion_format in ["sqlite"]: 1920 update_info_field = f""" 1921 "{info_id_sql}" = 1922 CASE 1923 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1924 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1925 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1926 END 1927 """ 1928 1929 sql_info_alter_table_array.append(update_info_field) 1930 1931 if sql_info_alter_table_array: 1932 1933 # By chromosomes 1934 try: 1935 chromosomes_list = list( 1936 self.get_query_to_df( 1937 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1938 )["#CHROM"] 1939 ) 1940 except: 1941 chromosomes_list = [None] 1942 1943 for chrom in chromosomes_list: 1944 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1945 1946 # Where clause 1947 where_clause = "" 1948 if chrom and len(chromosomes_list) > 1: 1949 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1950 1951 # Update table 1952 if proccess_all_fields_together: 1953 sql_info_alter_table_array_join = ", ".join( 1954 sql_info_alter_table_array 1955 ) 1956 if sql_info_alter_table_array_join: 1957 sql_info_alter_table = f""" 1958 UPDATE {table_variants} 1959 SET {sql_info_alter_table_array_join} 1960 {where_clause} 1961 """ 1962 log.debug( 1963 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1964 ) 1965 # log.debug(sql_info_alter_table) 1966 self.conn.execute(sql_info_alter_table) 1967 else: 1968 sql_info_alter_num = 0 1969 for sql_info_alter in sql_info_alter_table_array: 1970 sql_info_alter_num += 1 1971 sql_info_alter_table = f""" 1972 UPDATE {table_variants} 1973 SET {sql_info_alter} 1974 {where_clause} 1975 """ 1976 log.debug( 1977 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 
def create_indexes(self) -> None:
    """
    Create indexes on the variants table after data insertion.

    Creates a composite index on ("#CHROM", "POS", "REF", "ALT"), one
    single-column index per key column, and one index per field listed in
    `self.index_additionnal_fields`. Skipped entirely when indexing is
    disabled or the database is opened read-only (access "RO").
    """

    # Access mode ("RO" means read-only: no DDL allowed)
    access = self.get_config().get("access", None)

    # Variants table name
    table_variants = self.get_table_variants("FROM")

    if self.get_indexing() and access not in ["RO"]:
        # Composite index on the variant key columns
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
        self.conn.execute(sql_create_table_index)
        # Per-column indexes
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
        self.conn.execute(sql_create_table_index)
        sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
        self.conn.execute(sql_create_table_index)
        # One index per exploded/additional annotation field
        for field in self.index_additionnal_fields:
            sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
            self.conn.execute(sql_create_table_index)

def drop_indexes(self) -> None:
    """
    Drop all indexes of the variants table.

    Fixed: docstring previously said "Create indexes" (copy-paste error).
    Lists the table's indexes from the engine catalog (duckdb_indexes for
    DuckDB, sqlite_master for SQLite) and drops each one. No-op when the
    database is opened read-only (access "RO") or the connection format is
    not recognized (previously an unknown format raised UnboundLocalError
    because sql_list_indexes was never assigned).
    """

    # Access mode ("RO" means read-only: no DDL allowed)
    access = self.get_config().get("access", None)

    # Variants table name
    table_variants = self.get_table_variants("FROM")

    # Database engine format
    connexion_format = self.get_connexion_format()

    if access not in ["RO"]:
        if connexion_format in ["duckdb"]:
            sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
        elif connexion_format in ["sqlite"]:
            sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
        else:
            # Fix: unknown formats previously fell through to an
            # UnboundLocalError on sql_list_indexes
            return

        list_indexes = self.conn.execute(sql_list_indexes)
        index_names = [row[0] for row in list_indexes.fetchall()]
        for index in index_names:
            sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
            self.conn.execute(sql_drop_table_index)

def read_vcf_header(self, f) -> list:
    """
    Read the header of a VCF file from an open file object.

    :param f: an open, iterable file object positioned at the start
    :return: the header lines (every line up to and including the
        "#CHROM" column line)
    """

    header_list = []
    for line in f:
        header_list.append(line)
        # The "#CHROM" line is the last header line in a VCF
        if line.startswith("#CHROM"):
            break
    return header_list

def read_vcf_header_file(self, file: str = None) -> list:
    """
    Read the header of a VCF file, handling both bgzip-compressed and
    plain-text files.

    :param file: path to the VCF (or VCF header) file
    :type file: str
    :return: the header lines as a list
    """

    if self.get_input_compressed(input_file=file):
        # Compressed input: read through BGZF
        with bgzf.open(file, "rt") as f:
            return self.read_vcf_header(f=f)
    else:
        with open(file, "rt") as f:
            return self.read_vcf_header(f=f)

def execute_query(self, query: str):
    """
    Execute an SQL query on the current connection.

    :param query: the SQL query to execute; if falsy, nothing is run
    :return: the cursor/result returned by the connection, or None when
        no query was given
    """
    if query:
        return self.conn.execute(query)  # .fetchall()
    else:
        return None
2081 """ 2082 if query: 2083 return self.conn.execute(query) # .fetchall() 2084 else: 2085 return None 2086 2087 def export_output( 2088 self, 2089 output_file: str | None = None, 2090 output_header: str | None = None, 2091 export_header: bool = True, 2092 query: str | None = None, 2093 parquet_partitions: list | None = None, 2094 chunk_size: int | None = None, 2095 threads: int | None = None, 2096 sort: bool = False, 2097 index: bool = False, 2098 order_by: str | None = None, 2099 fields_to_rename: dict | None = None, 2100 ) -> bool: 2101 """ 2102 The `export_output` function exports data from a VCF file to various formats, including VCF, 2103 CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and 2104 partitioning. 2105 2106 :param output_file: The `output_file` parameter is a string that specifies the name of the 2107 output file where the exported data will be saved 2108 :type output_file: str | None 2109 :param output_header: The `output_header` parameter is a string that specifies the name of the 2110 file where the header of the VCF file will be exported. If this parameter is not provided, the 2111 header will be exported to a file with the same name as the `output_file` parameter, but with 2112 the extension " 2113 :type output_header: str | None 2114 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2115 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2116 True, the header will be exported to a file. If `export_header` is False, the header will not 2117 be, defaults to True 2118 :type export_header: bool (optional) 2119 :param query: The `query` parameter in the `export_output` function is an optional SQL query 2120 that can be used to filter and select specific data from the VCF file before exporting it. If 2121 provided, only the data that matches the query will be exported. 
This allows you to customize 2122 the exported data based on 2123 :type query: str | None 2124 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2125 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2126 organize data in a hierarchical directory structure based on the values of one or more columns. 2127 This can improve query performance when working with large datasets 2128 :type parquet_partitions: list | None 2129 :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when 2130 exporting data in Parquet format. This parameter is used for partitioning the Parquet file into 2131 multiple files. It helps in optimizing the export process by breaking down the data into 2132 manageable chunks for processing and storage 2133 :type chunk_size: int | None 2134 :param threads: The `threads` parameter in the `export_output` function specifies the number of 2135 threads to be used during the export process. It determines the level of parallelism and can 2136 improve the performance of the export operation. If this parameter is not provided, the function 2137 will use the default number of threads 2138 :type threads: int | None 2139 :param sort: The `sort` parameter in the `export_output` function is a boolean flag that 2140 determines whether the output file should be sorted based on genomic coordinates of the 2141 variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to 2142 `False`,, defaults to False 2143 :type sort: bool (optional) 2144 :param index: The `index` parameter in the `export_output` function is a boolean flag that 2145 determines whether an index should be created on the output file. If `index` is set to `True`, 2146 an index will be created on the output file. 
If `index` is set to `False`, no, defaults to False 2147 :type index: bool (optional) 2148 :param order_by: The `order_by` parameter in the `export_output` function is a string that 2149 specifies the column(s) to use for sorting the output file. This parameter is only applicable 2150 when exporting data in VCF format. It allows you to specify the column(s) based on which the 2151 output file should be 2152 :type order_by: str | None 2153 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the 2154 mapping of field names to be renamed during the export process. This parameter allows you to 2155 customize the output field names before exporting the data. Each key-value pair in the 2156 dictionary represents the original field name as the key and the new field name 2157 :type fields_to_rename: dict | None 2158 :return: The `export_output` function returns a boolean value. It checks if the output file 2159 exists and returns True if it does, or None if it doesn't. 
2160 """ 2161 2162 # Log 2163 log.info("Exporting...") 2164 2165 # Full path 2166 output_file = full_path(output_file) 2167 output_header = full_path(output_header) 2168 2169 # Config 2170 config = self.get_config() 2171 2172 # Param 2173 param = self.get_param() 2174 2175 # Tmp files to remove 2176 tmp_to_remove = [] 2177 2178 # If no output, get it 2179 if not output_file: 2180 output_file = self.get_output() 2181 2182 # If not threads 2183 if not threads: 2184 threads = self.get_threads() 2185 2186 # Rename fields 2187 if not fields_to_rename: 2188 fields_to_rename = param.get("export", {}).get("fields_to_rename", None) 2189 self.rename_info_fields(fields_to_rename=fields_to_rename) 2190 2191 # Auto header name with extension 2192 if export_header or output_header: 2193 if not output_header: 2194 output_header = f"{output_file}.hdr" 2195 # Export header 2196 self.export_header(output_file=output_file) 2197 2198 # Switch off export header if VCF output 2199 output_file_type = get_file_format(output_file) 2200 if output_file_type in ["vcf"]: 2201 export_header = False 2202 tmp_to_remove.append(output_header) 2203 2204 # Chunk size 2205 if not chunk_size: 2206 chunk_size = config.get("chunk_size", None) 2207 2208 # Parquet partition 2209 if not parquet_partitions: 2210 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2211 if parquet_partitions and isinstance(parquet_partitions, str): 2212 parquet_partitions = parquet_partitions.split(",") 2213 2214 # Order by 2215 if not order_by: 2216 order_by = param.get("export", {}).get("order_by", "") 2217 2218 # Header in output 2219 header_in_output = param.get("export", {}).get("include_header", False) 2220 2221 # Database 2222 database_source = self.get_connexion() 2223 2224 # Connexion format 2225 connexion_format = self.get_connexion_format() 2226 2227 # Explode infos 2228 if self.get_explode_infos(): 2229 self.explode_infos( 2230 prefix=self.get_explode_infos_prefix(), 2231 
fields=self.get_explode_infos_fields(), 2232 force=False, 2233 ) 2234 2235 # if connexion_format in ["sqlite"] or query: 2236 if connexion_format in ["sqlite"]: 2237 2238 # Export in Parquet 2239 random_tmp = "".join( 2240 random.choice(string.ascii_lowercase) for i in range(10) 2241 ) 2242 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2243 tmp_to_remove.append(database_source) 2244 2245 # Table Variants 2246 table_variants = self.get_table_variants() 2247 2248 # Create export query 2249 sql_query_export_subquery = f""" 2250 SELECT * FROM {table_variants} 2251 """ 2252 2253 # Write source file 2254 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2255 2256 # Create database 2257 database = Database( 2258 database=database_source, 2259 table="variants", 2260 header_file=output_header, 2261 conn_config=self.get_connexion_config(), 2262 ) 2263 2264 # Existing colomns header 2265 existing_columns_header = database.get_header_columns_from_database(query=query) 2266 2267 # Sample list 2268 if output_file_type in ["vcf"]: 2269 get_samples = self.get_samples() 2270 get_samples_check = self.get_samples_check() 2271 samples_force = get_samples is not None 2272 sample_list = self.get_header_sample_list( 2273 check=get_samples_check, 2274 samples=get_samples, 2275 samples_force=samples_force, 2276 ) 2277 else: 2278 sample_list = None 2279 2280 # Export file 2281 database.export( 2282 output_database=output_file, 2283 output_header=output_header, 2284 existing_columns_header=existing_columns_header, 2285 parquet_partitions=parquet_partitions, 2286 chunk_size=chunk_size, 2287 threads=threads, 2288 sort=sort, 2289 index=index, 2290 header_in_output=header_in_output, 2291 order_by=order_by, 2292 query=query, 2293 export_header=export_header, 2294 sample_list=sample_list, 2295 ) 2296 2297 # Remove 2298 remove_if_exists(tmp_to_remove) 2299 2300 return (os.path.exists(output_file) or None) and ( 2301 os.path.exists(output_file) 
def get_extra_infos(self, table: str = None) -> list:
    """
    Return the columns present in a table but absent from the header.

    :param table: table to inspect; when omitted, the variants table is
        used and its header columns are taken as the reference
    :type table: str
    :return: list of column names found in the table but not in the header
    """

    header_columns = []

    # Default to the variants table; only then are header columns known
    if not table:
        table = self.get_table_variants(clause="from")
        header_columns = self.get_header_columns()

    # Probe one row to discover the table's actual columns
    query = f""" SELECT * FROM {table} LIMIT 1 """
    log.debug(f"query {query}")
    table_columns = self.get_query_to_df(query).columns.tolist()

    # Keep only columns that the header does not declare
    return [column for column in table_columns if column not in header_columns]

def get_extra_infos_sql(self, table: str = None) -> str:
    """
    Return the extra-info columns as a comma-separated, double-quoted
    SQL column list.

    :param table: table to inspect; defaults to the variants table
    :type table: str
    :return: a string such as '"col1", "col2"'
    """

    quoted = ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
    return ", ".join(quoted)
def export_header(
    self,
    header_name: str = None,
    output_file: str = None,
    output_file_ext: str = ".hdr",
    clean_header: bool = True,
    remove_chrom_line: bool = False,
) -> str:
    """
    Extract the VCF header, optionally clean it, and write it to
    "<output_file><output_file_ext>".

    :param header_name: NOTE(review): this parameter is accepted but never
        used below — the header is always written to output_file +
        output_file_ext; confirm whether callers rely on it
    :param output_file: base name of the output; defaults to the object's
        configured output when neither header_name nor output_file is given
    :param output_file_ext: extension appended to output_file for the
        header file, defaults to ".hdr"
    :param clean_header: rewrite malformed "##FORMAT=...Type=Flag" lines
        as Type=String, defaults to True
    :param remove_chrom_line: drop the final "#CHROM" line from the
        written header, defaults to False
    :return: the name of the header file created, or None when the object
        has no header
    """

    if not header_name and not output_file:
        output_file = self.get_output()

    if self.get_header():

        # Get header object
        header_obj = self.get_header()

        # Create database wrapper to inspect the real file columns
        db_for_header = Database(database=self.get_input())

        # Get real columns in the file
        db_header_columns = db_for_header.get_columns()

        with tempfile.TemporaryDirectory() as tmpdir:

            # Write header to a temporary file
            # Fix: use context managers instead of bare open()/close()
            # so the handles are released even if writing raises
            header_file_tmp = os.path.join(tmpdir, "header")
            with open(header_file_tmp, "w") as f:
                vcf.Writer(f, header_obj)

            # Replace #CHROM line with the file's real columns
            header_list = db_for_header.read_header_file(
                header_file=header_file_tmp
            )
            header_list[-1] = "\t".join(db_header_columns)

            # Remove CHROM line
            if remove_chrom_line:
                header_list.pop()

            # Clean header
            if clean_header:
                header_list_clean = []
                for head in header_list:
                    # Clean head for malformed header: FORMAT fields cannot
                    # be of Type=Flag, rewrite them as Type=String
                    head_clean = head
                    head_clean = re.subn(
                        "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                        r"##FORMAT=<ID=\1,Number=\2,Type=String",
                        head_clean,
                        2,
                    )[0]
                    # Write header
                    header_list_clean.append(head_clean)
                header_list = header_list_clean

            tmp_header_name = output_file + output_file_ext

            with open(tmp_header_name, "w") as f:
                for line in header_list:
                    f.write(line)

        return tmp_header_name
2442 f.close() 2443 2444 return tmp_header_name 2445 2446 def export_variant_vcf( 2447 self, 2448 vcf_file, 2449 remove_info: bool = False, 2450 add_samples: bool = True, 2451 list_samples: list = [], 2452 where_clause: str = "", 2453 index: bool = False, 2454 threads: int | None = None, 2455 ) -> bool | None: 2456 """ 2457 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2458 remove INFO field, add samples, and control compression and indexing. 2459 2460 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2461 written to. It is the output file that will contain the filtered VCF data based on the specified 2462 parameters 2463 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2464 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2465 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2466 in, defaults to False 2467 :type remove_info: bool (optional) 2468 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2469 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2470 If set to False, the samples will be removed. The default value is True, defaults to True 2471 :type add_samples: bool (optional) 2472 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2473 in the output VCF file. By default, all samples will be included. If you provide a list of 2474 samples, only those samples will be included in the output file 2475 :type list_samples: list 2476 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2477 determines whether or not to create an index for the output VCF file. If `index` is set to 2478 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2479 :type index: bool (optional) 2480 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2481 number of threads to use for exporting the VCF file. It determines how many parallel threads 2482 will be used during the export process. More threads can potentially speed up the export process 2483 by utilizing multiple cores of the processor. If 2484 :type threads: int | None 2485 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2486 method with various parameters including the output file, query, threads, sort flag, and index 2487 flag. The `export_output` method is responsible for exporting the VCF data based on the 2488 specified parameters and configurations provided in the `export_variant_vcf` function. 2489 """ 2490 2491 # Config 2492 config = self.get_config() 2493 2494 # Extract VCF 2495 log.debug("Export VCF...") 2496 2497 # Table variants 2498 table_variants = self.get_table_variants() 2499 2500 # Threads 2501 if not threads: 2502 threads = self.get_threads() 2503 2504 # Info fields 2505 if remove_info: 2506 if not isinstance(remove_info, str): 2507 remove_info = "." 
2508 info_field = f"""'{remove_info}' as INFO""" 2509 else: 2510 info_field = "INFO" 2511 2512 # Samples fields 2513 if add_samples: 2514 if not list_samples: 2515 list_samples = self.get_header_sample_list() 2516 if list_samples: 2517 samples_fields = " , FORMAT , " + " , ".join( 2518 [f""" "{sample}" """ for sample in list_samples] 2519 ) 2520 else: 2521 samples_fields = "" 2522 log.debug(f"samples_fields: {samples_fields}") 2523 else: 2524 samples_fields = "" 2525 2526 # Where clause 2527 if where_clause is None: 2528 where_clause = "" 2529 2530 # Variants 2531 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2532 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2533 log.debug(f"sql_query_select={sql_query_select}") 2534 2535 return self.export_output( 2536 output_file=vcf_file, 2537 output_header=None, 2538 export_header=True, 2539 query=sql_query_select, 2540 parquet_partitions=None, 2541 chunk_size=config.get("chunk_size", None), 2542 threads=threads, 2543 sort=True, 2544 index=index, 2545 order_by=None, 2546 ) 2547 2548 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2549 """ 2550 It takes a list of commands and runs them in parallel using the number of threads specified 2551 2552 :param commands: A list of commands to run 2553 :param threads: The number of threads to use, defaults to 1 (optional) 2554 """ 2555 2556 run_parallel_commands(commands, threads) 2557 2558 def get_threads(self, default: int = 1) -> int: 2559 """ 2560 This function returns the number of threads to use for a job, with a default value of 1 if not 2561 specified. 2562 2563 :param default: The `default` parameter in the `get_threads` method is used to specify the 2564 default number of threads to use if no specific value is provided. 
If no value is provided for 2565 the `threads` parameter in the configuration or input parameters, the `default` value will be 2566 used, defaults to 1 2567 :type default: int (optional) 2568 :return: the number of threads to use for the current job. 2569 """ 2570 2571 # Config 2572 config = self.get_config() 2573 2574 # Param 2575 param = self.get_param() 2576 2577 # Input threads 2578 input_thread = param.get("threads", config.get("threads", None)) 2579 2580 # Check threads 2581 if not input_thread: 2582 threads = default 2583 elif int(input_thread) <= 0: 2584 threads = os.cpu_count() 2585 else: 2586 threads = int(input_thread) 2587 return threads 2588 2589 def get_memory(self, default: str = None) -> str: 2590 """ 2591 This function retrieves the memory value from parameters or configuration with a default value 2592 if not found. 2593 2594 :param default: The `get_memory` function takes in a default value as a string parameter. This 2595 default value is used as a fallback in case the `memory` parameter is not provided in the 2596 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2597 the function 2598 :type default: str 2599 :return: The `get_memory` function returns a string value representing the memory parameter. If 2600 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2601 return the default value provided as an argument to the function. 
2602 """ 2603 2604 # Config 2605 config = self.get_config() 2606 2607 # Param 2608 param = self.get_param() 2609 2610 # Input threads 2611 input_memory = param.get("memory", config.get("memory", None)) 2612 2613 # Check threads 2614 if input_memory: 2615 memory = input_memory 2616 else: 2617 memory = default 2618 2619 return memory 2620 2621 def update_from_vcf(self, vcf_file: str) -> None: 2622 """ 2623 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2624 2625 :param vcf_file: the path to the VCF file 2626 """ 2627 2628 connexion_format = self.get_connexion_format() 2629 2630 if connexion_format in ["duckdb"]: 2631 self.update_from_vcf_duckdb(vcf_file) 2632 elif connexion_format in ["sqlite"]: 2633 self.update_from_vcf_sqlite(vcf_file) 2634 2635 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2636 """ 2637 It takes a VCF file and updates the INFO column of the variants table in the database with the 2638 INFO column of the VCF file 2639 2640 :param vcf_file: the path to the VCF file 2641 """ 2642 2643 # varaints table 2644 table_variants = self.get_table_variants() 2645 2646 # Loading VCF into temporaire table 2647 skip = self.get_header_length(file=vcf_file) 2648 vcf_df = pd.read_csv( 2649 vcf_file, 2650 sep="\t", 2651 engine="c", 2652 skiprows=skip, 2653 header=0, 2654 low_memory=False, 2655 ) 2656 sql_query_update = f""" 2657 UPDATE {table_variants} as table_variants 2658 SET INFO = concat( 2659 CASE 2660 WHEN INFO NOT IN ('', '.') 2661 THEN INFO 2662 ELSE '' 2663 END, 2664 ( 2665 SELECT 2666 concat( 2667 CASE 2668 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2669 THEN ';' 2670 ELSE '' 2671 END 2672 , 2673 CASE 2674 WHEN table_parquet.INFO NOT IN ('','.') 2675 THEN table_parquet.INFO 2676 ELSE '' 2677 END 2678 ) 2679 FROM vcf_df as table_parquet 2680 WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2681 AND table_parquet.\"POS\" = 
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    """
    Merge the INFO column of a VCF file into the variants table (SQLite).

    The VCF body is loaded into a temporary table, then the variants
    table's INFO is extended (';'-separated) with the matching VCF INFO
    values; finally the temporary table is dropped.

    :param vcf_file: the path to the VCF file used to update the database
    """

    # Temporary table mirroring the variants schema (no rows)
    table_vcf = "tmp_vcf"
    sql_create = (
        f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
    )
    self.conn.execute(sql_create)

    # Load the VCF body (header lines start with '#') into the temp table
    frame = pd.read_csv(
        vcf_file, sep="\t", comment="#", header=None, low_memory=False
    )
    frame.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    frame.to_sql(table_vcf, self.conn, if_exists="append", index=False)

    # Extend INFO with the matching VCF INFO
    # warning: CONCAT as || operator
    sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
            (
            SELECT 
                    CASE
                        WHEN table_variants.INFO NOT IN ('','.')
                            AND table_vcf.INFO NOT IN ('','.')
                        THEN ';'
                        ELSE ''
                    END ||
                    CASE
                        WHEN table_vcf.INFO NOT IN ('','.')
                        THEN table_vcf.INFO
                        ELSE ''
                    END
            FROM {table_vcf} as table_vcf
            WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                AND table_vcf.\"POS\" = table_variants.\"POS\"
                AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                AND table_vcf.\"REF\" = table_variants.\"REF\"
            )
        """
    self.conn.execute(sql_query_update)

    # Clean up the temporary table
    sql_drop = f"DROP TABLE {table_vcf}"
    self.conn.execute(sql_drop)
def drop_variants_table(self) -> None:
    """
    Drop the variants table if it exists.
    """

    table_variants = self.get_table_variants()
    sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
    self.conn.execute(sql_table_variants)

def set_variant_id(
    self, variant_id_column: str = "variant_id", force: bool = None
) -> str:
    """
    Add a variant identifier column to the variants table, populated with
    a hash of assembly, "#CHROM", "POS", "REF", "ALT" and the SVTYPE tag.

    :param variant_id_column: name of the column to create, defaults to
        "variant_id"
    :type variant_id_column: str (optional)
    :param force: recreate/repopulate the column even if it already exists
    :type force: bool
    :return: the name of the variant-id column
    """

    # Assembly
    assembly = self.get_param().get(
        "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
    )

    # INFO/Tag prefix
    prefix = self.get_explode_infos_prefix()

    # Explode INFO/SVTYPE so it is available as a column
    added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

    # variants table
    table_variants = self.get_table_variants()

    # variant_id column
    if not variant_id_column:
        variant_id_column = "variant_id"

    # Create variant_id column
    # Fix: the existence check previously hard-coded "variant_id", so a
    # custom variant_id_column was never created unless force was set
    if variant_id_column not in self.get_extra_infos() or force:

        # Create column
        self.add_column(
            table_name=table_variants,
            column_name=variant_id_column,
            column_type="UBIGINT",
            default_value="0",
        )

        # Update column
        # NOTE(review): the last hash argument is the literal string
        # '"{prefix}SVTYPE"', not that column's value — confirm intended
        self.conn.execute(
            f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
            """
        )

    # Remove columns added by explode_infos
    for added_column in added_columns:
        self.drop_column(column=added_column)

    # return variant_id column name
    return variant_id_column
added_column in added_columns: 2812 self.drop_column(column=added_column) 2813 2814 # return variant_id column name 2815 return variant_id_column 2816 2817 def get_variant_id_column( 2818 self, variant_id_column: str = "variant_id", force: bool = None 2819 ) -> str: 2820 """ 2821 This function returns the variant_id column name 2822 2823 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2824 defaults to variant_id 2825 :type variant_id_column: str (optional) 2826 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2827 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2828 if it is not already set, or if it is set 2829 :type force: bool 2830 :return: The variant_id column name. 2831 """ 2832 2833 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2834 2835 ### 2836 # Annotation 2837 ### 2838 2839 def scan_databases( 2840 self, 2841 database_formats: list = ["parquet"], 2842 database_releases: list = ["current"], 2843 ) -> dict: 2844 """ 2845 The function `scan_databases` scans for available databases based on specified formats and 2846 releases. 2847 2848 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2849 of the databases to be scanned. In this case, the accepted format is "parquet" 2850 :type database_formats: list ["parquet"] 2851 :param database_releases: The `database_releases` parameter is a list that specifies the 2852 releases of the databases to be scanned. In the provided function, the default value for 2853 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2854 databases that are in the "current" 2855 :type database_releases: list 2856 :return: The function `scan_databases` returns a dictionary containing information about 2857 databases that match the specified formats and releases. 
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config; fall back to the project default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config; fall back to the project default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (union of generic, parquet and bcftools folders)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string form only)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold per-tool shortcut params into the quick-annotation list
        # NOTE(review): `!= None` should idiomatically be `is not None` throughout this section
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters (string form -> dict keyed by file spec)
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL (e.g. "ALL:format=parquet:release=current")
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

            for annotation_file in annotations_list:

                # Init
                annotations = annotations_list.get(annotation_file, None)

                # Annotation snpEff
                if annotation_file.startswith("snpeff"):

                    log.debug(f"Quick Annotation snpEff")

                    if "snpeff" not in param["annotation"]:
                        param["annotation"]["snpeff"] = {}

                    if "options" not in param["annotation"]["snpeff"]:
                        param["annotation"]["snpeff"]["options"] = ""

                    # snpEff options in annotations
                    param["annotation"]["snpeff"]["options"] = "".join(
                        annotation_file.split(":")[1:]
                    )

                # Annotation Annovar
                elif annotation_file.startswith("annovar"):

                    log.debug(f"Quick Annotation Annovar")

                    if "annovar" not in param["annotation"]:
                        param["annotation"]["annovar"] = {}

                    if "annotations" not in param["annotation"]["annovar"]:
                        param["annotation"]["annovar"]["annotations"] = {}

                    # Options
                    annotation_file_split = annotation_file.split(":")
                    for annotation_file_annotation in annotation_file_split[1:]:
                        if annotation_file_annotation:
                            param["annotation"]["annovar"]["annotations"][
                                annotation_file_annotation
                            ] = annotations

                # Annotation Exomiser
                elif annotation_file.startswith("exomiser"):

                    log.debug(f"Quick Annotation Exomiser")

                    param["annotation"]["exomiser"] = params_string_to_dict(
                        annotation_file
                    )

                # Annotation Splice
                elif annotation_file.startswith("splice"):

                    log.debug(f"Quick Annotation Splice")

                    param["annotation"]["splice"] = params_string_to_dict(
                        annotation_file
                    )

                # Annotation Parquet or BCFTOOLS
                else:

                    # Tools detection (explicit "tool:" prefix wins over format-based detection)
                    if annotation_file.startswith("bcftools:"):
                        annotation_tool_initial = "bcftools"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    elif annotation_file.startswith("snpsift:"):
                        annotation_tool_initial = "snpsift"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    elif annotation_file.startswith("bigwig:"):
                        annotation_tool_initial = "bigwig"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    else:
                        annotation_tool_initial = None

                    # list of files ("+" and ":" are both file separators here)
                    annotation_file_list = annotation_file.replace("+", ":").split(
                        ":"
                    )

                    for annotation_file in annotation_file_list:

                        if annotation_file:

                            # Annotation tool initial
                            annotation_tool = annotation_tool_initial

                            # Find file
                            annotation_file_found = None

                            if os.path.exists(annotation_file):
                                annotation_file_found = annotation_file
                            elif os.path.exists(full_path(annotation_file)):
                                annotation_file_found = full_path(annotation_file)
                            else:
                                # Find within assembly folders
                                for annotations_database in annotations_databases:
                                    found_files = find_all(
                                        annotation_file,
                                        os.path.join(
                                            annotations_database, assembly
                                        ),
                                    )
                                    if len(found_files) > 0:
                                        annotation_file_found = found_files[0]
                                        break
                                if not annotation_file_found and not assembly:
                                    # Find within folders
                                    for (
                                        annotations_database
                                    ) in annotations_databases:
                                        found_files = find_all(
                                            annotation_file, annotations_database
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                            # Full path
                            annotation_file_found = full_path(annotation_file_found)

                            if annotation_file_found:

                                database = Database(database=annotation_file_found)
                                quick_annotation_format = database.get_format()
                                quick_annotation_is_compressed = (
                                    database.is_compressed()
                                )
                                quick_annotation_is_indexed = os.path.exists(
                                    f"{annotation_file_found}.tbi"
                                )
                                # NOTE(review): hard-coded False makes the bcftools branch
                                # below unreachable — confirm whether this is intentional
                                bcftools_preference = False

                                # Check Annotation Tool
                                if not annotation_tool:
                                    if (
                                        bcftools_preference
                                        and quick_annotation_format
                                        in ["vcf", "bed"]
                                        and quick_annotation_is_compressed
                                        and quick_annotation_is_indexed
                                    ):
                                        annotation_tool = "bcftools"
                                    # NOTE(review): "tsv" is duplicated in this list
                                    elif quick_annotation_format in [
                                        "vcf",
                                        "bed",
                                        "tsv",
                                        "tsv",
                                        "csv",
                                        "json",
                                        "tbl",
                                        "parquet",
                                        "duckdb",
                                    ]:
                                        annotation_tool = "parquet"
                                    elif quick_annotation_format in ["bw"]:
                                        annotation_tool = "bigwig"
                                    else:
                                        log.error(
                                            f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                        )
                                        raise ValueError(
                                            f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                        )

                                log.debug(
                                    f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                )

                                # Annotation Tool dispatch
                                if annotation_tool:
                                    if annotation_tool not in param["annotation"]:
                                        param["annotation"][annotation_tool] = {}
                                    if (
                                        "annotations"
                                        not in param["annotation"][annotation_tool]
                                    ):
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ] = {}
                                    param["annotation"][annotation_tool][
                                        "annotations"
                                    ][annotation_file_found] = annotations

                            else:
                                log.warning(
                                    f"Quick Annotation File {annotation_file} does NOT exist"
                                )

        self.set_param(param)

        # Run each configured annotation tool in turn
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            # NOTE(review): exomiser/splice use `is not None` while the tools above use
            # truthiness — an empty dict triggers these two but not the others; confirm
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def annotation_bigwig(self, threads: int = None) -> None:
        """
        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.

        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
        number of threads to be used for parallel processing during the annotation process. If the
        `threads` parameter is not provided, the method will attempt to determine the optimal number of
        threads to use based on the system configuration
        :type threads: int
        :return: True
        """

        # DEBUG
        log.debug("Start annotation with bigwig databases")

        # # Threads
        # if not threads:
        #     threads = self.get_threads()
        # log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - bigwig databases folders (generic annotations + bigwig-specific)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bigwig", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bigwig", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # annotation_bigwig_config
                annotation_bigwig_config_list = []

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()

                    # If db_file is http ?
                    if database.get_database().startswith("http"):

                        # Database is HTTP URL
                        db_file_is_http = True

                        # DB file keep as URL
                        db_file = database.get_database()
                        log.warning(
                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
                        )

                        # Retrieve automatic annotation field name
                        annotation_field = clean_annotation_field(
                            os.path.basename(db_file).replace(".bw", "")
                        )
                        log.debug(
                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
                        )

                        # Create automatic header file
                        # NOTE(review): the column line below is space-separated as written;
                        # confirm whether the header parser expects tab separators
                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                        with open(db_hdr_file, "w") as f:
                            f.write("##fileformat=VCFv4.2\n")
                            f.write(
                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
                            )
                            f.write(f"#CHROM START END {annotation_field}\n")

                    else:

                        # Database is NOT HTTP URL
                        db_file_is_http = False

                    # Check index - try to create if not exists
                    if (
                        db_file is None
                        or db_hdr_file is None
                        or (not os.path.exists(db_file) and not db_file_is_http)
                        or not os.path.exists(db_hdr_file)
                        or not db_file_type in ["bw"]
                    ):
                        # if False:
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation file type: {db_file_type}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                        )
                    else:

                        # Log
                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database
                        # NOTE(review): annotation_fields_full is set but never read in
                        # this method (it is used in annotation_snpsift) — confirm
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Init
                        cyvcf2_header_rename_dict = {}
                        cyvcf2_header_list = []
                        cyvcf2_header_indexes = {}

                        # process annotation fields
                        for annotation_field in annotation_fields:

                            # New annotation name
                            annotation_field_new = annotation_fields[annotation_field]

                            # Check annotation field and index in header
                            # (-3 skips the #CHROM/START/END columns of the header file)
                            if (
                                annotation_field
                                in db_hdr_vcf.get_header_columns_as_list()
                            ):
                                annotation_field_index = (
                                    db_hdr_vcf.get_header_columns_as_list().index(
                                        annotation_field
                                    )
                                    - 3
                                )
                                cyvcf2_header_indexes[annotation_field_new] = (
                                    annotation_field_index
                                )
                            else:
                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                            # Append annotation field in cyvcf2 header list
                            cyvcf2_header_rename_dict[annotation_field_new] = (
                                db_hdr_vcf_header_infos[annotation_field].id
                            )
                            cyvcf2_header_list.append(
                                {
                                    "ID": annotation_field_new,
                                    "Number": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].num,
                                    "Type": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].type,
                                    "Description": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].desc,
                                }
                            )

                            # Add header on VCF
                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
                                annotation_field_new,
                                db_hdr_vcf_header_infos[annotation_field].num,
                                db_hdr_vcf_header_infos[annotation_field].type,
                                db_hdr_vcf_header_infos[annotation_field].desc,
                                "HOWARD BigWig annotation",
                                "unknown",
                                self.code_type_map[
                                    db_hdr_vcf_header_infos[annotation_field].type
                                ],
                            )

                        # Load bigwig database
                        bw_db = pyBigWig.open(db_file)
                        if bw_db.isBigWig():
                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
                        else:
                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        annotation_bigwig_config_list.append(
                            {
                                "db_file": db_file,
                                "bw_db": bw_db,
                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                                "cyvcf2_header_list": cyvcf2_header_list,
                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
                            }
                        )

                # Annotate
                if annotation_bigwig_config_list:

                    # Annotation config
                    log.debug(
                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
                    )

                    # Export VCF file
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Load input tmp file
                    input_vcf = cyvcf2.VCF(tmp_vcf_name)

                    # Add header in input file
                    for annotation_bigwig_config in annotation_bigwig_config_list:
                        for cyvcf2_header_field in annotation_bigwig_config.get(
                            "cyvcf2_header_list", []
                        ):
                            log.info(
                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
                            )
                            input_vcf.add_info_to_header(cyvcf2_header_field)

                    # Create output VCF file
                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                    # Fetch variants
                    log.info(f"Annotations 'bigwig' start...")
                    for variant in input_vcf:

                        for annotation_bigwig_config in annotation_bigwig_config_list:

                            # DB and indexes
                            bw_db = annotation_bigwig_config.get("bw_db", None)
                            cyvcf2_header_indexes = annotation_bigwig_config.get(
                                "cyvcf2_header_indexes", None
                            )

                            # Retrieve value from chrom pos (VCF POS is 1-based,
                            # pyBigWig intervals are 0-based half-open)
                            res = bw_db.values(
                                variant.CHROM, variant.POS - 1, variant.POS
                            )

                            # For each annotation fields (and indexes)
                            for cyvcf2_header_index in cyvcf2_header_indexes:

                                # Skip NaN (pyBigWig returns NaN where there is no value)
                                if not np.isnan(
                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
                                ):
                                    variant.INFO[cyvcf2_header_index] = res[
                                        cyvcf2_header_indexes[cyvcf2_header_index]
                                    ]

                        # Add record in output file
                        output_vcf.write_record(variant)

                    # Log
                    log.debug(f"Annotation done.")

                    # Close and write file
                    log.info(f"Annotations 'bigwig' write...")
                    output_vcf.close()
                    log.debug(f"Write done.")

                    # Update variants
                    log.info(f"Annotations 'bigwig' update...")
                    self.update_from_vcf(output_vcf_file)
                    log.debug(f"Update done.")

        return True

    def annotation_snpsift(self, threads: int = None) -> None:
        """
        This function annotates with snpSift, piping the result through bcftools
        annotate (for optional INFO field renaming), then updates the variants table.

        :param threads: Number of threads to use
        :return: None (returns early when the variants table is empty)
3626 """ 3627 3628 # DEBUG 3629 log.debug("Start annotation with bcftools databases") 3630 3631 # Threads 3632 if not threads: 3633 threads = self.get_threads() 3634 log.debug("Threads: " + str(threads)) 3635 3636 # Config 3637 config = self.get_config() 3638 log.debug("Config: " + str(config)) 3639 3640 # Config - snpSift 3641 snpsift_bin_command = get_bin_command( 3642 bin="SnpSift.jar", 3643 tool="snpsift", 3644 bin_type="jar", 3645 config=config, 3646 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3647 ) 3648 if not snpsift_bin_command: 3649 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3650 log.error(msg_err) 3651 raise ValueError(msg_err) 3652 3653 # Config - bcftools 3654 bcftools_bin_command = get_bin_command( 3655 bin="bcftools", 3656 tool="bcftools", 3657 bin_type="bin", 3658 config=config, 3659 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3660 ) 3661 if not bcftools_bin_command: 3662 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3663 log.error(msg_err) 3664 raise ValueError(msg_err) 3665 3666 # Config - BCFTools databases folders 3667 databases_folders = set( 3668 self.get_config() 3669 .get("folders", {}) 3670 .get("databases", {}) 3671 .get("annotations", ["."]) 3672 + self.get_config() 3673 .get("folders", {}) 3674 .get("databases", {}) 3675 .get("bcftools", ["."]) 3676 ) 3677 log.debug("Databases annotations: " + str(databases_folders)) 3678 3679 # Param 3680 annotations = ( 3681 self.get_param() 3682 .get("annotation", {}) 3683 .get("snpsift", {}) 3684 .get("annotations", None) 3685 ) 3686 log.debug("Annotations: " + str(annotations)) 3687 3688 # Assembly 3689 assembly = self.get_param().get( 3690 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3691 ) 3692 3693 # Data 3694 table_variants = self.get_table_variants() 3695 3696 # Check if not empty 3697 log.debug("Check if not empty") 3698 sql_query_chromosomes = ( 3699 f"""SELECT count(*) as count FROM {table_variants} as 
table_variants""" 3700 ) 3701 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3702 if not sql_query_chromosomes_df["count"][0]: 3703 log.info(f"VCF empty") 3704 return 3705 3706 # VCF header 3707 vcf_reader = self.get_header() 3708 log.debug("Initial header: " + str(vcf_reader.infos)) 3709 3710 # Existing annotations 3711 for vcf_annotation in self.get_header().infos: 3712 3713 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3714 log.debug( 3715 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3716 ) 3717 3718 if annotations: 3719 3720 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3721 3722 # Export VCF file 3723 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3724 3725 # Init 3726 commands = {} 3727 3728 for annotation in annotations: 3729 annotation_fields = annotations[annotation] 3730 3731 # Annotation Name 3732 annotation_name = os.path.basename(annotation) 3733 3734 if not annotation_fields: 3735 annotation_fields = {"INFO": None} 3736 3737 log.debug(f"Annotation '{annotation_name}'") 3738 log.debug( 3739 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3740 ) 3741 3742 # Create Database 3743 database = Database( 3744 database=annotation, 3745 databases_folders=databases_folders, 3746 assembly=assembly, 3747 ) 3748 3749 # Find files 3750 db_file = database.get_database() 3751 db_file = full_path(db_file) 3752 db_hdr_file = database.get_header_file() 3753 db_hdr_file = full_path(db_hdr_file) 3754 db_file_type = database.get_format() 3755 db_tbi_file = f"{db_file}.tbi" 3756 db_file_compressed = database.is_compressed() 3757 3758 # Check if compressed 3759 if not db_file_compressed: 3760 log.error( 3761 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3762 ) 3763 raise ValueError( 3764 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3765 ) 3766 3767 # Check if indexed 3768 if not os.path.exists(db_tbi_file): 3769 log.error( 3770 
f"Annotation '{annotation}' - {db_file} NOT indexed file" 3771 ) 3772 raise ValueError( 3773 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3774 ) 3775 3776 # Check index - try to create if not exists 3777 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3778 log.error("Annotation failed: database not valid") 3779 log.error(f"Annotation annotation file: {db_file}") 3780 log.error(f"Annotation annotation header: {db_hdr_file}") 3781 log.error(f"Annotation annotation index: {db_tbi_file}") 3782 raise ValueError( 3783 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3784 ) 3785 else: 3786 3787 log.debug( 3788 f"Annotation '{annotation}' - file: " 3789 + str(db_file) 3790 + " and " 3791 + str(db_hdr_file) 3792 ) 3793 3794 # Load header as VCF object 3795 db_hdr_vcf = Variants(input=db_hdr_file) 3796 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3797 log.debug( 3798 "Annotation database header: " 3799 + str(db_hdr_vcf_header_infos) 3800 ) 3801 3802 # For all fields in database 3803 annotation_fields_full = False 3804 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3805 annotation_fields = { 3806 key: key for key in db_hdr_vcf_header_infos 3807 } 3808 log.debug( 3809 "Annotation database header - All annotations added: " 3810 + str(annotation_fields) 3811 ) 3812 annotation_fields_full = True 3813 3814 # # Create file for field rename 3815 # log.debug("Create file for field rename") 3816 # tmp_rename = NamedTemporaryFile( 3817 # prefix=self.get_prefix(), 3818 # dir=self.get_tmp_dir(), 3819 # suffix=".rename", 3820 # delete=False, 3821 # ) 3822 # tmp_rename_name = tmp_rename.name 3823 # tmp_files.append(tmp_rename_name) 3824 3825 # Number of fields 3826 nb_annotation_field = 0 3827 annotation_list = [] 3828 annotation_infos_rename_list = [] 3829 3830 for annotation_field in 
annotation_fields: 3831 3832 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3833 annotation_fields_new_name = annotation_fields.get( 3834 annotation_field, annotation_field 3835 ) 3836 if not annotation_fields_new_name: 3837 annotation_fields_new_name = annotation_field 3838 3839 # Check if field is in DB and if field is not elready in input data 3840 if ( 3841 annotation_field in db_hdr_vcf.get_header().infos 3842 and annotation_fields_new_name 3843 not in self.get_header().infos 3844 ): 3845 3846 log.info( 3847 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3848 ) 3849 3850 # BCFTools annotate param to rename fields 3851 if annotation_field != annotation_fields_new_name: 3852 annotation_infos_rename_list.append( 3853 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3854 ) 3855 3856 # Add INFO field to header 3857 db_hdr_vcf_header_infos_number = ( 3858 db_hdr_vcf_header_infos[annotation_field].num or "." 3859 ) 3860 db_hdr_vcf_header_infos_type = ( 3861 db_hdr_vcf_header_infos[annotation_field].type 3862 or "String" 3863 ) 3864 db_hdr_vcf_header_infos_description = ( 3865 db_hdr_vcf_header_infos[annotation_field].desc 3866 or f"{annotation_field} description" 3867 ) 3868 db_hdr_vcf_header_infos_source = ( 3869 db_hdr_vcf_header_infos[annotation_field].source 3870 or "unknown" 3871 ) 3872 db_hdr_vcf_header_infos_version = ( 3873 db_hdr_vcf_header_infos[annotation_field].version 3874 or "unknown" 3875 ) 3876 3877 vcf_reader.infos[annotation_fields_new_name] = ( 3878 vcf.parser._Info( 3879 annotation_fields_new_name, 3880 db_hdr_vcf_header_infos_number, 3881 db_hdr_vcf_header_infos_type, 3882 db_hdr_vcf_header_infos_description, 3883 db_hdr_vcf_header_infos_source, 3884 db_hdr_vcf_header_infos_version, 3885 self.code_type_map[ 3886 db_hdr_vcf_header_infos_type 3887 ], 3888 ) 3889 ) 3890 3891 annotation_list.append(annotation_field) 3892 3893 nb_annotation_field += 1 3894 3895 else: 3896 
3897 if ( 3898 annotation_field 3899 not in db_hdr_vcf.get_header().infos 3900 ): 3901 log.warning( 3902 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3903 ) 3904 if ( 3905 annotation_fields_new_name 3906 in self.get_header().infos 3907 ): 3908 log.warning( 3909 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3910 ) 3911 3912 log.info( 3913 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3914 ) 3915 3916 annotation_infos = ",".join(annotation_list) 3917 3918 if annotation_infos != "": 3919 3920 # Annotated VCF (and error file) 3921 tmp_annotation_vcf_name = os.path.join( 3922 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3923 ) 3924 tmp_annotation_vcf_name_err = ( 3925 tmp_annotation_vcf_name + ".err" 3926 ) 3927 3928 # Add fields to annotate 3929 if not annotation_fields_full: 3930 annotation_infos_option = f"-info {annotation_infos}" 3931 else: 3932 annotation_infos_option = "" 3933 3934 # Info fields rename 3935 if annotation_infos_rename_list: 3936 annotation_infos_rename = " -c " + ",".join( 3937 annotation_infos_rename_list 3938 ) 3939 else: 3940 annotation_infos_rename = "" 3941 3942 # Annotate command 3943 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3944 3945 # Add command 3946 commands[command_annotate] = tmp_annotation_vcf_name 3947 3948 if commands: 3949 3950 # Export VCF file 3951 self.export_variant_vcf( 3952 vcf_file=tmp_vcf_name, 3953 remove_info=True, 3954 add_samples=False, 3955 index=True, 3956 ) 3957 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3958 3959 # Num command 3960 nb_command = 0 3961 3962 # Annotate 3963 for command_annotate in commands: 3964 nb_command += 1 3965 log.info( 3966 f"Annotation - 
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with `bcftools annotate`, using the
        VCF/BED annotation databases configured in
        param["annotation"]["bcftools"]["annotations"].

        For each configured database: verify it is compressed and indexed
        (a `.tbi` file must exist next to it), load its header to discover
        the INFO fields it provides, and build one `bcftools annotate`
        command per chromosome (restricted via a merged-region BED file
        built around the variant positions). All commands are run in
        parallel, the per-chromosome annotated files are merged back with
        `bcftools merge`, stderr output of every command is scanned for
        `[E::`/`[W::` messages, and the variants table is finally updated
        from the merged VCF.

        :param threads: Number of threads to use; defaults to
            ``self.get_threads()`` when falsy
        :return: None (returns early if the variants table is empty)
        :raises ValueError: if no bcftools binary is available, if a
            database is not compressed/indexed/valid, or if any annotation
            command wrote an error (``[E::``) message
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads: fall back to the instance-level setting
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep tmp files around in debug mode for post-mortem inspection.
        # NOTE(review): delete_tmp is computed but never read in this
        # method — cleanup is actually done by the "rm -f" appended to the
        # merge command below. TODO confirm whether this flag should gate
        # that cleanup.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (resolved from config/tools folder)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - database search folders: union of the generic
        # "annotations" folders and the bcftools-specific ones
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping of annotation database -> fields to annotate
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param overrides config, falls back to default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data - name of the variants table to annotate
        table_variants = self.get_table_variants()

        # Nothing to do on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Reserve the temp VCF path the variants will be exported to
        # (export itself is deferred until we know commands exist)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header object; new INFO fields are registered on it below
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log annotations already present in the input VCF
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []   # per-chromosome annotated VCFs to merge
            commands = []           # shell commands to run in parallel
            tmp_files = []          # temp files removed after the merge
            err_files = []          # stderr capture files to scan afterwards

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name (basename of the database path)
                annotation_name = os.path.basename(annotation)

                # No explicit fields -> take all INFO fields of the database
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Resolve the database within the configured folders/assembly
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Locate database file, header file, format and index
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # bcftools annotate requires a compressed database
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # ...and a tabix index next to it
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Database and its header file must both exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load the database header as a Variants object to
                    # inspect its INFO field declarations
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" pseudo-fields -> expand to every INFO
                    # field declared by the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Count of fields actually usable for this database
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Use the field only if the database declares it
                        # AND the (renamed) field is not already in the input
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Register the INFO field on the output header,
                            # filling missing metadata with defaults
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # Renamed fields use bcftools' "NEW:=INFO/OLD"
                            # column syntax; unchanged fields pass as-is
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # Field skipped: explain why
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    # Comma-separated -c column list for bcftools annotate
                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools: keep only "##" lines
                        # (drop any "#CHROM" header line and variant lines)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Choose zcat/cat depending on header compression
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run header extraction synchronously
                        run_parallel_commands([command_extract_header], 1)

                        # Find the distinct chromosomes present in the table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases carry no INFO block: map the first
                        # three columns to CHROM/POS/POS explicitly
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # BED restricting annotation to regions around
                            # this chromosome's variants
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb window per variant,
                            # clamped at 0, then merged into intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Per-chromosome output VCF and its stderr file
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # annotate restricted to the BED regions, then
                            # tabix-index the result; stderr appended to the
                            # .err file for later scanning
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export the variants as the initial VCF (INFO stripped,
                # no samples, indexed) now that work is guaranteed
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Split the thread budget across the annotate commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() may yield 0 when commands outnumber threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Inject --threads into each bcftools annotate invocation
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Run all annotate commands in parallel
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge the per-chromosome annotated VCFs back together
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Merge target. NOTE(review): delete=True means this
                    # file is removed when the Python object is closed —
                    # presumably bcftools overwrites the existing path
                    # while the handle stays open; confirm intended.
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Remove intermediate temp files after a successful merge
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # bcftools merge of the initial VCF with all annotated
                    # files, then cleanup of intermediates
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Scan every captured stderr file for htslib-style
                    # warning ([W::) and error ([E::) messages
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info: deduplicated errors and warnings
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info: every deduplicated message
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # any [E:: message aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Load the merged annotated VCF back into the table
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
4498 Example: 4499 "phenotypicFeatures": 4500 [ 4501 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4502 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4503 ] 4504 - "hpo" (list) 4505 List of HPO ids as phenotypic features. 4506 Example: 4507 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4508 Default: [] 4509 - "outputOptions" (dict): 4510 Output options (see Exomiser docs). 4511 Default: 4512 "output_options" = 4513 { 4514 "outputContributingVariantsOnly": False, 4515 "numGenes": 0, 4516 "outputFormats": ["TSV_VARIANT", "VCF"] 4517 } 4518 - "transcript_source" (string): 4519 Transcript source (either "refseq", "ucsc", "ensembl") 4520 Default: "refseq" 4521 - "exomiser_to_info" (boolean): 4522 Add exomiser TSV file columns as INFO fields in VCF. 4523 Default: False 4524 - "release" (string): 4525 Exomise database release. 4526 If not exists, database release will be downloaded (take a while). 4527 Default: None (provided by application.properties configuration file) 4528 - "exomiser_application_properties" (file): 4529 Exomiser configuration file (see Exomiser docs). 4530 Useful to automatically download databases (especially for specific genome databases). 4531 4532 Notes: 4533 - If no sample in parameters, first sample in VCF will be chosen 4534 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4535 4536 :param threads: The number of threads to use 4537 :return: None. 
4538 """ 4539 4540 # DEBUG 4541 log.debug("Start annotation with Exomiser databases") 4542 4543 # Threads 4544 if not threads: 4545 threads = self.get_threads() 4546 log.debug("Threads: " + str(threads)) 4547 4548 # Config 4549 config = self.get_config() 4550 log.debug("Config: " + str(config)) 4551 4552 # Config - Folders - Databases 4553 databases_folders = ( 4554 config.get("folders", {}) 4555 .get("databases", {}) 4556 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4557 ) 4558 databases_folders = full_path(databases_folders) 4559 if not os.path.exists(databases_folders): 4560 log.error(f"Databases annotations: {databases_folders} NOT found") 4561 log.debug("Databases annotations: " + str(databases_folders)) 4562 4563 # Config - Exomiser 4564 exomiser_bin_command = get_bin_command( 4565 bin="exomiser-cli*.jar", 4566 tool="exomiser", 4567 bin_type="jar", 4568 config=config, 4569 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4570 ) 4571 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4572 if not exomiser_bin_command: 4573 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4574 log.error(msg_err) 4575 raise ValueError(msg_err) 4576 4577 # Param 4578 param = self.get_param() 4579 log.debug("Param: " + str(param)) 4580 4581 # Param - Exomiser 4582 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4583 log.debug(f"Param Exomiser: {param_exomiser}") 4584 4585 # Param - Assembly 4586 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4587 log.debug("Assembly: " + str(assembly)) 4588 4589 # Data 4590 table_variants = self.get_table_variants() 4591 4592 # Check if not empty 4593 log.debug("Check if not empty") 4594 sql_query_chromosomes = ( 4595 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4596 ) 4597 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4598 log.info(f"VCF empty") 4599 return False 4600 4601 # VCF header 4602 
vcf_reader = self.get_header() 4603 log.debug("Initial header: " + str(vcf_reader.infos)) 4604 4605 # Samples 4606 samples = self.get_header_sample_list() 4607 if not samples: 4608 log.error("No Samples in VCF") 4609 return False 4610 log.debug(f"Samples: {samples}") 4611 4612 # Memory limit 4613 memory_limit = self.get_memory("8G") 4614 log.debug(f"memory_limit: {memory_limit}") 4615 4616 # Exomiser java options 4617 exomiser_java_options = ( 4618 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4619 ) 4620 log.debug(f"Exomiser java options: {exomiser_java_options}") 4621 4622 # Download Exomiser (if not exists) 4623 exomiser_release = param_exomiser.get("release", None) 4624 exomiser_application_properties = param_exomiser.get( 4625 "exomiser_application_properties", None 4626 ) 4627 databases_download_exomiser( 4628 assemblies=[assembly], 4629 exomiser_folder=databases_folders, 4630 exomiser_release=exomiser_release, 4631 exomiser_phenotype_release=exomiser_release, 4632 exomiser_application_properties=exomiser_application_properties, 4633 ) 4634 4635 # Force annotation 4636 force_update_annotation = True 4637 4638 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4639 log.debug("Start annotation Exomiser") 4640 4641 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4642 4643 # tmp_dir = "/tmp/exomiser" 4644 4645 ### ANALYSIS ### 4646 ################ 4647 4648 # Create analysis.json through analysis dict 4649 # either analysis in param or by default 4650 # depending on preset exome/genome) 4651 4652 # Init analysis dict 4653 param_exomiser_analysis_dict = {} 4654 4655 # analysis from param 4656 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4657 param_exomiser_analysis = full_path(param_exomiser_analysis) 4658 4659 # If analysis in param -> load anlaysis json 4660 if param_exomiser_analysis: 4661 4662 # If param analysis is a file and exists 4663 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4664 param_exomiser_analysis 4665 ): 4666 # Load analysis file into analysis dict (either yaml or json) 4667 with open(param_exomiser_analysis) as json_file: 4668 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4669 4670 # If param analysis is a dict 4671 elif isinstance(param_exomiser_analysis, dict): 4672 # Load analysis dict into analysis dict (either yaml or json) 4673 param_exomiser_analysis_dict = param_exomiser_analysis 4674 4675 # Error analysis type 4676 else: 4677 log.error(f"Analysis type unknown. Check param file.") 4678 raise ValueError(f"Analysis type unknown. Check param file.") 4679 4680 # Case no input analysis config file/dict 4681 # Use preset (exome/genome) to open default config file 4682 if not param_exomiser_analysis_dict: 4683 4684 # default preset 4685 default_preset = "exome" 4686 4687 # Get param preset or default preset 4688 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4689 4690 # Try to find if preset is a file 4691 if os.path.exists(param_exomiser_preset): 4692 # Preset file is provided in full path 4693 param_exomiser_analysis_default_config_file = ( 4694 param_exomiser_preset 4695 ) 4696 # elif os.path.exists(full_path(param_exomiser_preset)): 4697 # # Preset file is provided in full path 4698 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4699 elif os.path.exists( 4700 os.path.join(folder_config, param_exomiser_preset) 4701 ): 4702 # Preset file is provided a basename in config folder (can be a path with subfolders) 4703 param_exomiser_analysis_default_config_file = os.path.join( 4704 folder_config, param_exomiser_preset 4705 ) 4706 else: 4707 # Construct preset file 4708 param_exomiser_analysis_default_config_file = os.path.join( 4709 folder_config, 4710 f"preset-{param_exomiser_preset}-analysis.json", 4711 ) 4712 4713 # If preset file exists 4714 param_exomiser_analysis_default_config_file = full_path( 4715 
param_exomiser_analysis_default_config_file 4716 ) 4717 if os.path.exists(param_exomiser_analysis_default_config_file): 4718 # Load prest file into analysis dict (either yaml or json) 4719 with open( 4720 param_exomiser_analysis_default_config_file 4721 ) as json_file: 4722 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4723 json_file 4724 ) 4725 4726 # Error preset file 4727 else: 4728 log.error( 4729 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4730 ) 4731 raise ValueError( 4732 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4733 ) 4734 4735 # If no analysis dict created 4736 if not param_exomiser_analysis_dict: 4737 log.error(f"No analysis config") 4738 raise ValueError(f"No analysis config") 4739 4740 # Log 4741 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4742 4743 ### PHENOPACKET ### 4744 ################### 4745 4746 # If no PhenoPacket in analysis dict -> check in param 4747 if "phenopacket" not in param_exomiser_analysis_dict: 4748 4749 # If PhenoPacket in param -> load anlaysis json 4750 if param_exomiser.get("phenopacket", None): 4751 4752 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4753 param_exomiser_phenopacket = full_path( 4754 param_exomiser_phenopacket 4755 ) 4756 4757 # If param phenopacket is a file and exists 4758 if isinstance( 4759 param_exomiser_phenopacket, str 4760 ) and os.path.exists(param_exomiser_phenopacket): 4761 # Load phenopacket file into analysis dict (either yaml or json) 4762 with open(param_exomiser_phenopacket) as json_file: 4763 param_exomiser_analysis_dict["phenopacket"] = ( 4764 yaml.safe_load(json_file) 4765 ) 4766 4767 # If param phenopacket is a dict 4768 elif isinstance(param_exomiser_phenopacket, dict): 4769 # Load phenopacket dict into analysis dict (either yaml or json) 4770 param_exomiser_analysis_dict["phenopacket"] = ( 4771 param_exomiser_phenopacket 4772 ) 4773 4774 # Error phenopacket type 
4775 else: 4776 log.error(f"Phenopacket type unknown. Check param file.") 4777 raise ValueError( 4778 f"Phenopacket type unknown. Check param file." 4779 ) 4780 4781 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4782 if "phenopacket" not in param_exomiser_analysis_dict: 4783 4784 # Init PhenoPacket 4785 param_exomiser_analysis_dict["phenopacket"] = { 4786 "id": "analysis", 4787 "proband": {}, 4788 } 4789 4790 ### Add subject ### 4791 4792 # If subject exists 4793 param_exomiser_subject = param_exomiser.get("subject", {}) 4794 4795 # If subject not exists -> found sample ID 4796 if not param_exomiser_subject: 4797 4798 # Found sample ID in param 4799 sample = param_exomiser.get("sample", None) 4800 4801 # Find sample ID (first sample) 4802 if not sample: 4803 sample_list = self.get_header_sample_list() 4804 if len(sample_list) > 0: 4805 sample = sample_list[0] 4806 else: 4807 log.error(f"No sample found") 4808 raise ValueError(f"No sample found") 4809 4810 # Create subject 4811 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4812 4813 # Add to dict 4814 param_exomiser_analysis_dict["phenopacket"][ 4815 "subject" 4816 ] = param_exomiser_subject 4817 4818 ### Add "phenotypicFeatures" ### 4819 4820 # If phenotypicFeatures exists 4821 param_exomiser_phenotypicfeatures = param_exomiser.get( 4822 "phenotypicFeatures", [] 4823 ) 4824 4825 # If phenotypicFeatures not exists -> Try to infer from hpo list 4826 if not param_exomiser_phenotypicfeatures: 4827 4828 # Found HPO in param 4829 param_exomiser_hpo = param_exomiser.get("hpo", []) 4830 4831 # Split HPO if list in string format separated by comma 4832 if isinstance(param_exomiser_hpo, str): 4833 param_exomiser_hpo = param_exomiser_hpo.split(",") 4834 4835 # Create HPO list 4836 for hpo in param_exomiser_hpo: 4837 hpo_clean = re.sub("[^0-9]", "", hpo) 4838 param_exomiser_phenotypicfeatures.append( 4839 { 4840 "type": { 4841 "id": f"HP:{hpo_clean}", 4842 "label": 
f"HP:{hpo_clean}", 4843 } 4844 } 4845 ) 4846 4847 # Add to dict 4848 param_exomiser_analysis_dict["phenopacket"][ 4849 "phenotypicFeatures" 4850 ] = param_exomiser_phenotypicfeatures 4851 4852 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4853 if not param_exomiser_phenotypicfeatures: 4854 for step in param_exomiser_analysis_dict.get( 4855 "analysis", {} 4856 ).get("steps", []): 4857 if "hiPhivePrioritiser" in step: 4858 param_exomiser_analysis_dict.get("analysis", {}).get( 4859 "steps", [] 4860 ).remove(step) 4861 4862 ### Add Input File ### 4863 4864 # Initial file name and htsFiles 4865 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4866 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4867 { 4868 "uri": tmp_vcf_name, 4869 "htsFormat": "VCF", 4870 "genomeAssembly": assembly, 4871 } 4872 ] 4873 4874 ### Add metaData ### 4875 4876 # If metaData not in analysis dict 4877 if "metaData" not in param_exomiser_analysis_dict: 4878 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4879 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4880 "createdBy": "howard", 4881 "phenopacketSchemaVersion": 1, 4882 } 4883 4884 ### OutputOptions ### 4885 4886 # Init output result folder 4887 output_results = os.path.join(tmp_dir, "results") 4888 4889 # If no outputOptions in analysis dict 4890 if "outputOptions" not in param_exomiser_analysis_dict: 4891 4892 # default output formats 4893 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4894 4895 # Get outputOptions in param 4896 output_options = param_exomiser.get("outputOptions", None) 4897 4898 # If no output_options in param -> check 4899 if not output_options: 4900 output_options = { 4901 "outputContributingVariantsOnly": False, 4902 "numGenes": 0, 4903 "outputFormats": defaut_output_formats, 4904 } 4905 4906 # Replace outputDirectory in output options 4907 output_options["outputDirectory"] = output_results 4908 output_options["outputFileName"] = "howard" 4909 4910 # 
Add outputOptions in analysis dict 4911 param_exomiser_analysis_dict["outputOptions"] = output_options 4912 4913 else: 4914 4915 # Replace output_results and output format (if exists in param) 4916 param_exomiser_analysis_dict["outputOptions"][ 4917 "outputDirectory" 4918 ] = output_results 4919 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4920 list( 4921 set( 4922 param_exomiser_analysis_dict.get( 4923 "outputOptions", {} 4924 ).get("outputFormats", []) 4925 + ["TSV_VARIANT", "VCF"] 4926 ) 4927 ) 4928 ) 4929 4930 # log 4931 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4932 4933 ### ANALYSIS FILE ### 4934 ##################### 4935 4936 ### Full JSON analysis config file ### 4937 4938 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4939 with open(exomiser_analysis, "w") as fp: 4940 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4941 4942 ### SPLIT analysis and sample config files 4943 4944 # Splitted analysis dict 4945 param_exomiser_analysis_dict_for_split = ( 4946 param_exomiser_analysis_dict.copy() 4947 ) 4948 4949 # Phenopacket JSON file 4950 exomiser_analysis_phenopacket = os.path.join( 4951 tmp_dir, "analysis_phenopacket.json" 4952 ) 4953 with open(exomiser_analysis_phenopacket, "w") as fp: 4954 json.dump( 4955 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4956 fp, 4957 indent=4, 4958 ) 4959 4960 # Analysis JSON file without Phenopacket parameters 4961 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4962 exomiser_analysis_analysis = os.path.join( 4963 tmp_dir, "analysis_analysis.json" 4964 ) 4965 with open(exomiser_analysis_analysis, "w") as fp: 4966 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4967 4968 ### INITAL VCF file ### 4969 ####################### 4970 4971 ### Create list of samples to use and include inti initial VCF file #### 4972 4973 # Subject (main sample) 4974 # Get sample ID in analysis dict 4975 sample_subject = ( 4976 
param_exomiser_analysis_dict.get("phenopacket", {}) 4977 .get("subject", {}) 4978 .get("id", None) 4979 ) 4980 sample_proband = ( 4981 param_exomiser_analysis_dict.get("phenopacket", {}) 4982 .get("proband", {}) 4983 .get("subject", {}) 4984 .get("id", None) 4985 ) 4986 sample = [] 4987 if sample_subject: 4988 sample.append(sample_subject) 4989 if sample_proband: 4990 sample.append(sample_proband) 4991 4992 # Get sample ID within Pedigree 4993 pedigree_persons_list = ( 4994 param_exomiser_analysis_dict.get("phenopacket", {}) 4995 .get("pedigree", {}) 4996 .get("persons", {}) 4997 ) 4998 4999 # Create list with all sample ID in pedigree (if exists) 5000 pedigree_persons = [] 5001 for person in pedigree_persons_list: 5002 pedigree_persons.append(person.get("individualId")) 5003 5004 # Concat subject sample ID and samples ID in pedigreesamples 5005 samples = list(set(sample + pedigree_persons)) 5006 5007 # Check if sample list is not empty 5008 if not samples: 5009 log.error(f"No samples found") 5010 raise ValueError(f"No samples found") 5011 5012 # Create VCF with sample (either sample in param or first one by default) 5013 # Export VCF file 5014 self.export_variant_vcf( 5015 vcf_file=tmp_vcf_name, 5016 remove_info=True, 5017 add_samples=True, 5018 list_samples=samples, 5019 index=False, 5020 ) 5021 5022 ### Execute Exomiser ### 5023 ######################## 5024 5025 # Init command 5026 exomiser_command = "" 5027 5028 # Command exomiser options 5029 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5030 5031 # Release 5032 exomiser_release = param_exomiser.get("release", None) 5033 if exomiser_release: 5034 # phenotype data version 5035 exomiser_options += ( 5036 f" --exomiser.phenotype.data-version={exomiser_release} " 5037 ) 5038 # data version 5039 exomiser_options += ( 5040 f" --exomiser.{assembly}.data-version={exomiser_release} " 5041 ) 5042 # variant 
white list 5043 variant_white_list_file = ( 5044 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5045 ) 5046 if os.path.exists( 5047 os.path.join( 5048 databases_folders, assembly, variant_white_list_file 5049 ) 5050 ): 5051 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5052 5053 # transcript_source 5054 transcript_source = param_exomiser.get( 5055 "transcript_source", None 5056 ) # ucsc, refseq, ensembl 5057 if transcript_source: 5058 exomiser_options += ( 5059 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5060 ) 5061 5062 # If analysis contain proband param 5063 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5064 "proband", {} 5065 ): 5066 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5067 5068 # If no proband (usually uniq sample) 5069 else: 5070 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5071 5072 # Log 5073 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5074 5075 # Run command 5076 result = subprocess.call( 5077 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5078 ) 5079 if result: 5080 log.error("Exomiser command failed") 5081 raise ValueError("Exomiser command failed") 5082 5083 ### RESULTS ### 5084 ############### 5085 5086 ### Annotate with TSV fields ### 5087 5088 # Init result tsv file 5089 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5090 5091 # Init result tsv file 5092 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5093 5094 # Parse TSV file and explode columns in INFO field 5095 if exomiser_to_info and os.path.exists(output_results_tsv): 5096 5097 # Log 5098 log.debug("Exomiser columns to VCF INFO field") 5099 5100 # Retrieve columns and types 5101 query = f""" SELECT * FROM read_csv('{output_results_tsv}', 
auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5102 output_results_tsv_df = self.get_query_to_df(query) 5103 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5104 5105 # Init concat fields for update 5106 sql_query_update_concat_fields = [] 5107 5108 # Fields to avoid 5109 fields_to_avoid = [ 5110 "CONTIG", 5111 "START", 5112 "END", 5113 "REF", 5114 "ALT", 5115 "QUAL", 5116 "FILTER", 5117 "GENOTYPE", 5118 ] 5119 5120 # List all columns to add into header 5121 for header_column in output_results_tsv_columns: 5122 5123 # If header column is enable 5124 if header_column not in fields_to_avoid: 5125 5126 # Header info type 5127 header_info_type = "String" 5128 header_column_df = output_results_tsv_df[header_column] 5129 header_column_df_dtype = header_column_df.dtype 5130 if header_column_df_dtype == object: 5131 if ( 5132 pd.to_numeric(header_column_df, errors="coerce") 5133 .notnull() 5134 .all() 5135 ): 5136 header_info_type = "Float" 5137 else: 5138 header_info_type = "Integer" 5139 5140 # Header info 5141 characters_to_validate = ["-"] 5142 pattern = "[" + "".join(characters_to_validate) + "]" 5143 header_info_name = re.sub( 5144 pattern, 5145 "_", 5146 f"Exomiser_{header_column}".replace("#", ""), 5147 ) 5148 header_info_number = "." 
5149 header_info_description = ( 5150 f"Exomiser {header_column} annotation" 5151 ) 5152 header_info_source = "Exomiser" 5153 header_info_version = "unknown" 5154 header_info_code = CODE_TYPE_MAP[header_info_type] 5155 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5156 header_info_name, 5157 header_info_number, 5158 header_info_type, 5159 header_info_description, 5160 header_info_source, 5161 header_info_version, 5162 header_info_code, 5163 ) 5164 5165 # Add field to add for update to concat fields 5166 sql_query_update_concat_fields.append( 5167 f""" 5168 CASE 5169 WHEN table_parquet."{header_column}" NOT IN ('','.') 5170 THEN concat( 5171 '{header_info_name}=', 5172 table_parquet."{header_column}", 5173 ';' 5174 ) 5175 5176 ELSE '' 5177 END 5178 """ 5179 ) 5180 5181 # Update query 5182 sql_query_update = f""" 5183 UPDATE {table_variants} as table_variants 5184 SET INFO = concat( 5185 CASE 5186 WHEN INFO NOT IN ('', '.') 5187 THEN INFO 5188 ELSE '' 5189 END, 5190 CASE 5191 WHEN table_variants.INFO NOT IN ('','.') 5192 THEN ';' 5193 ELSE '' 5194 END, 5195 ( 5196 SELECT 5197 concat( 5198 {",".join(sql_query_update_concat_fields)} 5199 ) 5200 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5201 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5202 AND table_parquet.\"START\" = table_variants.\"POS\" 5203 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5204 AND table_parquet.\"REF\" = table_variants.\"REF\" 5205 ) 5206 ) 5207 ; 5208 """ 5209 5210 # Update 5211 self.conn.execute(sql_query_update) 5212 5213 ### Annotate with VCF INFO field ### 5214 5215 # Init result VCF file 5216 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5217 5218 # If VCF exists 5219 if os.path.exists(output_results_vcf): 5220 5221 # Log 5222 log.debug("Exomiser result VCF update variants") 5223 5224 # Find Exomiser INFO field annotation in header 5225 with 
gzip.open(output_results_vcf, "rt") as f: 5226 header_list = self.read_vcf_header(f) 5227 exomiser_vcf_header = vcf.Reader( 5228 io.StringIO("\n".join(header_list)) 5229 ) 5230 5231 # Add annotation INFO field to header 5232 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5233 5234 # Update variants with VCF 5235 self.update_from_vcf(output_results_vcf) 5236 5237 return True 5238 5239 def annotation_snpeff(self, threads: int = None) -> None: 5240 """ 5241 This function annotate with snpEff 5242 5243 :param threads: The number of threads to use 5244 :return: the value of the variable "return_value". 5245 """ 5246 5247 # DEBUG 5248 log.debug("Start annotation with snpeff databases") 5249 5250 # Threads 5251 if not threads: 5252 threads = self.get_threads() 5253 log.debug("Threads: " + str(threads)) 5254 5255 # DEBUG 5256 delete_tmp = True 5257 if self.get_config().get("verbosity", "warning") in ["debug"]: 5258 delete_tmp = False 5259 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5260 5261 # Config 5262 config = self.get_config() 5263 log.debug("Config: " + str(config)) 5264 5265 # Config - Folders - Databases 5266 databases_folders = ( 5267 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5268 ) 5269 log.debug("Databases annotations: " + str(databases_folders)) 5270 5271 # Config - snpEff bin command 5272 snpeff_bin_command = get_bin_command( 5273 bin="snpEff.jar", 5274 tool="snpeff", 5275 bin_type="jar", 5276 config=config, 5277 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5278 ) 5279 if not snpeff_bin_command: 5280 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5281 log.error(msg_err) 5282 raise ValueError(msg_err) 5283 5284 # Config - snpEff databases 5285 snpeff_databases = ( 5286 config.get("folders", {}) 5287 .get("databases", {}) 5288 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5289 ) 5290 snpeff_databases = full_path(snpeff_databases) 5291 if snpeff_databases is not None and 
snpeff_databases != "": 5292 log.debug(f"Create snpEff databases folder") 5293 if not os.path.exists(snpeff_databases): 5294 os.makedirs(snpeff_databases) 5295 5296 # Param 5297 param = self.get_param() 5298 log.debug("Param: " + str(param)) 5299 5300 # Param 5301 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5302 log.debug("Options: " + str(options)) 5303 5304 # Param - Assembly 5305 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5306 5307 # Param - Options 5308 snpeff_options = ( 5309 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5310 ) 5311 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5312 snpeff_csvstats = ( 5313 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5314 ) 5315 if snpeff_stats: 5316 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5317 snpeff_stats = full_path(snpeff_stats) 5318 snpeff_options += f" -stats {snpeff_stats}" 5319 if snpeff_csvstats: 5320 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5321 snpeff_csvstats = full_path(snpeff_csvstats) 5322 snpeff_options += f" -csvStats {snpeff_csvstats}" 5323 5324 # Data 5325 table_variants = self.get_table_variants() 5326 5327 # Check if not empty 5328 log.debug("Check if not empty") 5329 sql_query_chromosomes = ( 5330 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5331 ) 5332 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5333 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5334 log.info(f"VCF empty") 5335 return 5336 5337 # Export in VCF 5338 log.debug("Create initial file to annotate") 5339 tmp_vcf = NamedTemporaryFile( 5340 prefix=self.get_prefix(), 5341 dir=self.get_tmp_dir(), 5342 suffix=".vcf.gz", 5343 delete=True, 5344 ) 5345 tmp_vcf_name = tmp_vcf.name 5346 5347 # VCF header 5348 vcf_reader = self.get_header() 5349 log.debug("Initial header: " + 
str(vcf_reader.infos)) 5350 5351 # Existing annotations 5352 for vcf_annotation in self.get_header().infos: 5353 5354 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5355 log.debug( 5356 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5357 ) 5358 5359 # Memory limit 5360 # if config.get("memory", None): 5361 # memory_limit = config.get("memory", "8G") 5362 # else: 5363 # memory_limit = "8G" 5364 memory_limit = self.get_memory("8G") 5365 log.debug(f"memory_limit: {memory_limit}") 5366 5367 # snpEff java options 5368 snpeff_java_options = ( 5369 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5370 ) 5371 log.debug(f"Exomiser java options: {snpeff_java_options}") 5372 5373 force_update_annotation = True 5374 5375 if "ANN" not in self.get_header().infos or force_update_annotation: 5376 5377 # Check snpEff database 5378 log.debug(f"Check snpEff databases {[assembly]}") 5379 databases_download_snpeff( 5380 folder=snpeff_databases, assemblies=[assembly], config=config 5381 ) 5382 5383 # Export VCF file 5384 self.export_variant_vcf( 5385 vcf_file=tmp_vcf_name, 5386 remove_info=True, 5387 add_samples=False, 5388 index=True, 5389 ) 5390 5391 # Tmp file 5392 err_files = [] 5393 tmp_annotate_vcf = NamedTemporaryFile( 5394 prefix=self.get_prefix(), 5395 dir=self.get_tmp_dir(), 5396 suffix=".vcf", 5397 delete=False, 5398 ) 5399 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5400 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5401 err_files.append(tmp_annotate_vcf_name_err) 5402 5403 # Command 5404 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5405 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5406 run_parallel_commands([snpeff_command], 1) 5407 5408 # Error messages 5409 log.info(f"Error/Warning messages:") 5410 error_message_command_all = [] 5411 
error_message_command_warning = [] 5412 error_message_command_err = [] 5413 for err_file in err_files: 5414 with open(err_file, "r") as f: 5415 for line in f: 5416 message = line.strip() 5417 error_message_command_all.append(message) 5418 if line.startswith("[W::"): 5419 error_message_command_warning.append(message) 5420 if line.startswith("[E::"): 5421 error_message_command_err.append(f"{err_file}: " + message) 5422 # log info 5423 for message in list( 5424 set(error_message_command_err + error_message_command_warning) 5425 ): 5426 log.info(f" {message}") 5427 # debug info 5428 for message in list(set(error_message_command_all)): 5429 log.debug(f" {message}") 5430 # failed 5431 if len(error_message_command_err): 5432 log.error("Annotation failed: Error in commands") 5433 raise ValueError("Annotation failed: Error in commands") 5434 5435 # Find annotation in header 5436 with open(tmp_annotate_vcf_name, "rt") as f: 5437 header_list = self.read_vcf_header(f) 5438 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5439 5440 for ann in annovar_vcf_header.infos: 5441 if ann not in self.get_header().infos: 5442 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5443 5444 # Update variants 5445 log.info(f"Annotation - Updating...") 5446 self.update_from_vcf(tmp_annotate_vcf_name) 5447 5448 else: 5449 if "ANN" in self.get_header().infos: 5450 log.debug(f"Existing snpEff annotations in VCF") 5451 if force_update_annotation: 5452 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 5453 5454 def annotation_annovar(self, threads: int = None) -> None: 5455 """ 5456 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5457 annotations 5458 5459 :param threads: number of threads to use 5460 :return: the value of the variable "return_value". 
    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations

        Variants are exported to a temporary bgzipped VCF, each configured
        Annovar database is run through table_annovar.pl, outputs are cleaned
        through a bcftools/sed/awk pipeline, merged with bcftools, and merged
        annotations are written back into the variants table.

        :param threads: number of threads to use
        :return: the value of the variable "return_value".
        :raises ValueError: if annovar/bcftools binaries or the databases
            folder cannot be resolved, or if a command reports errors
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files (accumulated for final cleanup)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): 'delete_tmp' is never consulted afterwards — the
        # cleanup block below is guarded by 'if True:'; confirm intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl script table_annovar.pl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases (a list config keeps only its first entry)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is never used below — appears vestigial
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): re-initialized here each iteration, discarding
                # the method-level 'err_files' — errors are checked per database
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one 'old new' mapping line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: 'g' gene-based, 'r' region-based, 'f' filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ('genebase' is consumed above, not passed through)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar (then rename multianno output to a .tmp.vcf)
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr files for warning/error markers
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f" {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f" {message}")
                # failed: any error marker aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for merged output
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                # NOTE(review): merge stderr is appended to 'err_files' but no
                # error check runs after the merge — confirm this is intended
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of merged file
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                    annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Merge new INFO definitions into the in-memory header
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): guarded by 'if True:' — 'delete_tmp' computed at the
            # top of the method is never used here; confirm intent
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)
False) 5900 ) 5901 log.debug(f"force_update_annotation={force_update_annotation}") 5902 force_append_annotation = ( 5903 self.get_param() 5904 .get("annotation", {}) 5905 .get("options", {}) 5906 .get("annotations_append", False) 5907 ) 5908 log.debug(f"force_append_annotation={force_append_annotation}") 5909 5910 # Data 5911 table_variants = self.get_table_variants() 5912 5913 # Check if not empty 5914 log.debug("Check if not empty") 5915 sql_query_chromosomes_df = self.get_query_to_df( 5916 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5917 ) 5918 if not sql_query_chromosomes_df["count"][0]: 5919 log.info(f"VCF empty") 5920 return 5921 5922 # VCF header 5923 vcf_reader = self.get_header() 5924 log.debug("Initial header: " + str(vcf_reader.infos)) 5925 5926 # Nb Variants POS 5927 log.debug("NB Variants Start") 5928 nb_variants = self.conn.execute( 5929 f"SELECT count(*) AS count FROM variants" 5930 ).fetchdf()["count"][0] 5931 log.debug("NB Variants Stop") 5932 5933 # Existing annotations 5934 for vcf_annotation in self.get_header().infos: 5935 5936 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5937 log.debug( 5938 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5939 ) 5940 5941 # Added columns 5942 added_columns = [] 5943 5944 # drop indexes 5945 log.debug(f"Drop indexes...") 5946 self.drop_indexes() 5947 5948 if annotations: 5949 5950 if "ALL" in annotations: 5951 5952 all_param = annotations.get("ALL", {}) 5953 all_param_formats = all_param.get("formats", None) 5954 all_param_releases = all_param.get("releases", None) 5955 5956 databases_infos_dict = self.scan_databases( 5957 database_formats=all_param_formats, 5958 database_releases=all_param_releases, 5959 ) 5960 for database_infos in databases_infos_dict.keys(): 5961 if database_infos not in annotations: 5962 annotations[database_infos] = {"INFO": None} 5963 5964 for annotation in annotations: 5965 5966 if annotation in ["ALL"]: 
5967 continue 5968 5969 # Annotation Name 5970 annotation_name = os.path.basename(annotation) 5971 5972 # Annotation fields 5973 annotation_fields = annotations[annotation] 5974 if not annotation_fields: 5975 annotation_fields = {"INFO": None} 5976 5977 log.debug(f"Annotation '{annotation_name}'") 5978 log.debug( 5979 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5980 ) 5981 5982 # Create Database 5983 database = Database( 5984 database=annotation, 5985 databases_folders=databases_folders, 5986 assembly=assembly, 5987 ) 5988 5989 # Find files 5990 parquet_file = database.get_database() 5991 parquet_hdr_file = database.get_header_file() 5992 parquet_type = database.get_type() 5993 5994 # Check if files exists 5995 if not parquet_file or not parquet_hdr_file: 5996 msg_err_list = [] 5997 if not parquet_file: 5998 msg_err_list.append( 5999 f"Annotation failed: Annotation file not found" 6000 ) 6001 if parquet_file and not parquet_hdr_file: 6002 msg_err_list.append( 6003 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 6004 ) 6005 6006 log.error(". ".join(msg_err_list)) 6007 raise ValueError(". 
".join(msg_err_list)) 6008 else: 6009 # Get parquet connexion 6010 parquet_sql_attach = database.get_sql_database_attach( 6011 output="query" 6012 ) 6013 if parquet_sql_attach: 6014 self.conn.execute(parquet_sql_attach) 6015 parquet_file_link = database.get_sql_database_link() 6016 # Log 6017 log.debug( 6018 f"Annotation '{annotation_name}' - file: " 6019 + str(parquet_file) 6020 + " and " 6021 + str(parquet_hdr_file) 6022 ) 6023 6024 # Database full header columns 6025 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6026 parquet_hdr_file 6027 ) 6028 # Log 6029 log.debug( 6030 "Annotation database header columns : " 6031 + str(parquet_hdr_vcf_header_columns) 6032 ) 6033 6034 # Load header as VCF object 6035 parquet_hdr_vcf_header_infos = database.get_header().infos 6036 # Log 6037 log.debug( 6038 "Annotation database header: " 6039 + str(parquet_hdr_vcf_header_infos) 6040 ) 6041 6042 # Get extra infos 6043 parquet_columns = database.get_extra_columns() 6044 # Log 6045 log.debug("Annotation database Columns: " + str(parquet_columns)) 6046 6047 # Add extra columns if "ALL" in annotation_fields 6048 # if "ALL" in annotation_fields: 6049 # allow_add_extra_column = True 6050 if "ALL" in annotation_fields and database.get_extra_columns(): 6051 for extra_column in database.get_extra_columns(): 6052 if ( 6053 extra_column not in annotation_fields 6054 and extra_column.replace("INFO/", "") 6055 not in parquet_hdr_vcf_header_infos 6056 ): 6057 parquet_hdr_vcf_header_infos[extra_column] = ( 6058 vcf.parser._Info( 6059 extra_column, 6060 ".", 6061 "String", 6062 f"{extra_column} description", 6063 "unknown", 6064 "unknown", 6065 self.code_type_map["String"], 6066 ) 6067 ) 6068 6069 # For all fields in database 6070 annotation_fields_all = False 6071 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6072 annotation_fields_all = True 6073 annotation_fields = { 6074 key: key for key in parquet_hdr_vcf_header_infos 6075 } 6076 6077 log.debug( 6078 
"Annotation database header - All annotations added: " 6079 + str(annotation_fields) 6080 ) 6081 6082 # Init 6083 6084 # List of annotation fields to use 6085 sql_query_annotation_update_info_sets = [] 6086 6087 # List of annotation to agregate 6088 sql_query_annotation_to_agregate = [] 6089 6090 # Number of fields 6091 nb_annotation_field = 0 6092 6093 # Annotation fields processed 6094 annotation_fields_processed = [] 6095 6096 # Columns mapping 6097 map_columns = database.map_columns( 6098 columns=annotation_fields, prefixes=["INFO/"] 6099 ) 6100 6101 # Query dict for fields to remove (update option) 6102 query_dict_remove = {} 6103 6104 # Fetch Anotation fields 6105 for annotation_field in annotation_fields: 6106 6107 # annotation_field_column 6108 annotation_field_column = map_columns.get( 6109 annotation_field, "INFO" 6110 ) 6111 6112 # field new name, if parametered 6113 annotation_fields_new_name = annotation_fields.get( 6114 annotation_field, annotation_field 6115 ) 6116 if not annotation_fields_new_name: 6117 annotation_fields_new_name = annotation_field 6118 6119 # To annotate 6120 # force_update_annotation = True 6121 # force_append_annotation = True 6122 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6123 if annotation_field in parquet_hdr_vcf_header_infos and ( 6124 force_update_annotation 6125 or force_append_annotation 6126 or ( 6127 annotation_fields_new_name 6128 not in self.get_header().infos 6129 ) 6130 ): 6131 6132 # Add field to annotation to process list 6133 annotation_fields_processed.append( 6134 annotation_fields_new_name 6135 ) 6136 6137 # explode infos for the field 6138 annotation_fields_new_name_info_msg = "" 6139 if ( 6140 force_update_annotation 6141 and annotation_fields_new_name 6142 in self.get_header().infos 6143 ): 6144 # Remove field from INFO 6145 query = f""" 6146 UPDATE {table_variants} as table_variants 6147 SET INFO = 
REGEXP_REPLACE( 6148 concat(table_variants.INFO,''), 6149 ';*{annotation_fields_new_name}=[^;]*', 6150 '' 6151 ) 6152 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6153 """ 6154 annotation_fields_new_name_info_msg = " [update]" 6155 query_dict_remove[ 6156 f"remove 'INFO/{annotation_fields_new_name}'" 6157 ] = query 6158 6159 # Sep between fields in INFO 6160 nb_annotation_field += 1 6161 if nb_annotation_field > 1: 6162 annotation_field_sep = ";" 6163 else: 6164 annotation_field_sep = "" 6165 6166 log.info( 6167 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6168 ) 6169 6170 # Add INFO field to header 6171 parquet_hdr_vcf_header_infos_number = ( 6172 parquet_hdr_vcf_header_infos[annotation_field].num 6173 or "." 6174 ) 6175 parquet_hdr_vcf_header_infos_type = ( 6176 parquet_hdr_vcf_header_infos[annotation_field].type 6177 or "String" 6178 ) 6179 parquet_hdr_vcf_header_infos_description = ( 6180 parquet_hdr_vcf_header_infos[annotation_field].desc 6181 or f"{annotation_field} description" 6182 ) 6183 parquet_hdr_vcf_header_infos_source = ( 6184 parquet_hdr_vcf_header_infos[annotation_field].source 6185 or "unknown" 6186 ) 6187 parquet_hdr_vcf_header_infos_version = ( 6188 parquet_hdr_vcf_header_infos[annotation_field].version 6189 or "unknown" 6190 ) 6191 6192 vcf_reader.infos[annotation_fields_new_name] = ( 6193 vcf.parser._Info( 6194 annotation_fields_new_name, 6195 parquet_hdr_vcf_header_infos_number, 6196 parquet_hdr_vcf_header_infos_type, 6197 parquet_hdr_vcf_header_infos_description, 6198 parquet_hdr_vcf_header_infos_source, 6199 parquet_hdr_vcf_header_infos_version, 6200 self.code_type_map[ 6201 parquet_hdr_vcf_header_infos_type 6202 ], 6203 ) 6204 ) 6205 6206 # Append 6207 if force_append_annotation: 6208 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 
6209 else: 6210 query_case_when_append = "" 6211 6212 # Annotation/Update query fields 6213 # Found in INFO column 6214 if ( 6215 annotation_field_column == "INFO" 6216 and "INFO" in parquet_hdr_vcf_header_columns 6217 ): 6218 sql_query_annotation_update_info_sets.append( 6219 f""" 6220 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6221 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6222 ELSE '' 6223 END 6224 """ 6225 ) 6226 # Found in a specific column 6227 else: 6228 sql_query_annotation_update_info_sets.append( 6229 f""" 6230 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6231 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6232 ELSE '' 6233 END 6234 """ 6235 ) 6236 sql_query_annotation_to_agregate.append( 6237 f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6238 ) 6239 6240 # Not to annotate 6241 else: 6242 6243 if force_update_annotation: 6244 annotation_message = "forced" 6245 else: 6246 annotation_message = "skipped" 6247 6248 if annotation_field not in parquet_hdr_vcf_header_infos: 6249 log.warning( 6250 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6251 ) 6252 if annotation_fields_new_name in self.get_header().infos: 6253 log.warning( 6254 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6255 ) 6256 6257 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6258 # allow_annotation_full_info = True 6259 allow_annotation_full_info = not force_append_annotation 6260 6261 if parquet_type in ["regions"]: 6262 allow_annotation_full_info = False 6263 6264 if ( 6265 allow_annotation_full_info 6266 and nb_annotation_field == len(annotation_fields) 6267 and annotation_fields_all 6268 and ( 6269 "INFO" in parquet_hdr_vcf_header_columns 6270 and "INFO" in database.get_extra_columns() 6271 ) 6272 ): 6273 log.debug("Column INFO annotation enabled") 6274 sql_query_annotation_update_info_sets = [] 6275 sql_query_annotation_update_info_sets.append( 6276 f" table_parquet.INFO " 6277 ) 6278 6279 if sql_query_annotation_update_info_sets: 6280 6281 # Annotate 6282 log.info(f"Annotation '{annotation_name}' - Annotation...") 6283 6284 # Join query annotation update info sets for SQL 6285 sql_query_annotation_update_info_sets_sql = ",".join( 6286 sql_query_annotation_update_info_sets 6287 ) 6288 6289 # Check chromosomes list (and variants infos) 6290 sql_query_chromosomes = f""" 6291 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6292 FROM {table_variants} as table_variants 6293 GROUP BY table_variants."#CHROM" 6294 ORDER BY table_variants."#CHROM" 6295 """ 6296 sql_query_chromosomes_df = self.conn.execute( 6297 sql_query_chromosomes 6298 ).df() 6299 sql_query_chromosomes_dict = { 6300 entry["CHROM"]: { 6301 "count": entry["count_variants"], 6302 "min": entry["min_variants"], 6303 "max": entry["max_variants"], 6304 } 6305 for index, entry in sql_query_chromosomes_df.iterrows() 6306 } 6307 6308 # Init 6309 nb_of_query = 0 6310 nb_of_variant_annotated = 0 6311 query_dict = query_dict_remove 6312 6313 # for chrom in sql_query_chromosomes_df["CHROM"]: 6314 for chrom in sql_query_chromosomes_dict: 6315 6316 # Number of variant by chromosome 6317 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6318 chrom, {} 6319 ).get("count", 0) 6320 6321 
log.debug( 6322 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 6323 ) 6324 6325 # Annotation with regions database 6326 if parquet_type in ["regions"]: 6327 sql_query_annotation_from_clause = f""" 6328 FROM ( 6329 SELECT 6330 '{chrom}' AS \"#CHROM\", 6331 table_variants_from.\"POS\" AS \"POS\", 6332 {",".join(sql_query_annotation_to_agregate)} 6333 FROM {table_variants} as table_variants_from 6334 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6335 table_parquet_from."#CHROM" = '{chrom}' 6336 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6337 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6338 ) 6339 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6340 GROUP BY table_variants_from.\"POS\" 6341 ) 6342 as table_parquet 6343 """ 6344 6345 sql_query_annotation_where_clause = """ 6346 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6347 AND table_parquet.\"POS\" = table_variants.\"POS\" 6348 """ 6349 6350 # Annotation with variants database 6351 else: 6352 sql_query_annotation_from_clause = f""" 6353 FROM {parquet_file_link} as table_parquet 6354 """ 6355 sql_query_annotation_where_clause = f""" 6356 table_variants."#CHROM" = '{chrom}' 6357 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6358 AND table_parquet.\"POS\" = table_variants.\"POS\" 6359 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6360 AND table_parquet.\"REF\" = table_variants.\"REF\" 6361 """ 6362 6363 # Create update query 6364 sql_query_annotation_chrom_interval_pos = f""" 6365 UPDATE {table_variants} as table_variants 6366 SET INFO = 6367 concat( 6368 CASE WHEN table_variants.INFO NOT IN ('','.') 6369 THEN table_variants.INFO 6370 ELSE '' 6371 END 6372 , 6373 CASE WHEN table_variants.INFO NOT IN ('','.') 6374 AND ( 6375 concat({sql_query_annotation_update_info_sets_sql}) 6376 ) 6377 NOT IN ('','.') 6378 THEN ';' 6379 ELSE '' 6380 END 6381 , 6382 
{sql_query_annotation_update_info_sets_sql} 6383 ) 6384 {sql_query_annotation_from_clause} 6385 WHERE {sql_query_annotation_where_clause} 6386 ; 6387 """ 6388 6389 # Add update query to dict 6390 query_dict[ 6391 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6392 ] = sql_query_annotation_chrom_interval_pos 6393 6394 nb_of_query = len(query_dict) 6395 num_query = 0 6396 6397 # SET max_expression_depth TO x 6398 self.conn.execute("SET max_expression_depth TO 10000") 6399 6400 for query_name in query_dict: 6401 query = query_dict[query_name] 6402 num_query += 1 6403 log.info( 6404 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6405 ) 6406 result = self.conn.execute(query) 6407 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6408 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6409 log.info( 6410 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6411 ) 6412 6413 log.info( 6414 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6415 ) 6416 6417 else: 6418 6419 log.info( 6420 f"Annotation '{annotation_name}' - No Annotations available" 6421 ) 6422 6423 log.debug("Final header: " + str(vcf_reader.infos)) 6424 6425 # Remove added columns 6426 for added_column in added_columns: 6427 self.drop_column(column=added_column) 6428 6429 def annotation_splice(self, threads: int = None) -> None: 6430 """ 6431 This function annotate with snpEff 6432 6433 :param threads: The number of threads to use 6434 :return: the value of the variable "return_value". 
6435 """ 6436 6437 # DEBUG 6438 log.debug("Start annotation with splice tools") 6439 6440 # Threads 6441 if not threads: 6442 threads = self.get_threads() 6443 log.debug("Threads: " + str(threads)) 6444 6445 # DEBUG 6446 delete_tmp = True 6447 if self.get_config().get("verbosity", "warning") in ["debug"]: 6448 delete_tmp = False 6449 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6450 6451 # Config 6452 config = self.get_config() 6453 log.debug("Config: " + str(config)) 6454 splice_config = config.get("tools", {}).get("splice", {}) 6455 if not splice_config: 6456 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6457 msg_err = "No Splice tool config" 6458 raise ValueError(msg_err) 6459 log.debug(f"splice_config: {splice_config}") 6460 6461 # Config - Folders - Databases 6462 databases_folders = ( 6463 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6464 ) 6465 log.debug("Databases annotations: " + str(databases_folders)) 6466 6467 # Splice docker image 6468 splice_docker_image = splice_config.get("docker").get("image") 6469 6470 # Pull splice image if it's not already there 6471 if not check_docker_image_exists(splice_docker_image): 6472 log.warning( 6473 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6474 ) 6475 try: 6476 command(f"docker pull {splice_config.get('docker').get('image')}") 6477 except subprocess.CalledProcessError: 6478 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6479 log.error(msg_err) 6480 raise ValueError(msg_err) 6481 6482 # Config - splice databases 6483 splice_databases = ( 6484 config.get("folders", {}) 6485 .get("databases", {}) 6486 .get("splice", DEFAULT_SPLICE_FOLDER) 6487 ) 6488 splice_databases = full_path(splice_databases) 6489 6490 # Param 6491 param = self.get_param() 6492 log.debug("Param: " + str(param)) 6493 6494 # Param 6495 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6496 
log.debug("Options: " + str(options)) 6497 6498 # Data 6499 table_variants = self.get_table_variants() 6500 6501 # Check if not empty 6502 log.debug("Check if not empty") 6503 sql_query_chromosomes = ( 6504 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6505 ) 6506 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6507 log.info("VCF empty") 6508 return None 6509 6510 # Export in VCF 6511 log.debug("Create initial file to annotate") 6512 6513 # Create output folder / work folder 6514 if options.get("output_folder", ""): 6515 output_folder = options.get("output_folder", "") 6516 if not os.path.exists(output_folder): 6517 Path(output_folder).mkdir(parents=True, exist_ok=True) 6518 else: 6519 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6520 if not os.path.exists(output_folder): 6521 Path(output_folder).mkdir(parents=True, exist_ok=True) 6522 6523 if options.get("workdir", ""): 6524 workdir = options.get("workdir", "") 6525 else: 6526 workdir = "/work" 6527 6528 # Create tmp VCF file 6529 tmp_vcf = NamedTemporaryFile( 6530 prefix=self.get_prefix(), 6531 dir=output_folder, 6532 suffix=".vcf", 6533 delete=False, 6534 ) 6535 tmp_vcf_name = tmp_vcf.name 6536 6537 # VCF header 6538 header = self.get_header() 6539 6540 # Existing annotations 6541 for vcf_annotation in self.get_header().infos: 6542 6543 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6544 log.debug( 6545 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6546 ) 6547 6548 # Memory limit 6549 if config.get("memory", None): 6550 memory_limit = config.get("memory", "8G").upper() 6551 # upper() 6552 else: 6553 memory_limit = "8G" 6554 log.debug(f"memory_limit: {memory_limit}") 6555 6556 # Check number of variants to annotate 6557 where_clause_regex_spliceai = r"SpliceAI_\w+" 6558 where_clause_regex_spip = r"SPiP_\w+" 6559 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6560 df_list_of_variants_to_annotate = self.get_query_to_df( 6561 query=f""" SELECT * FROM variants {where_clause} """ 6562 ) 6563 if len(df_list_of_variants_to_annotate) == 0: 6564 log.warning( 6565 f"No variants to annotate with splice. Variants probably already annotated with splice" 6566 ) 6567 return None 6568 else: 6569 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6570 6571 # Export VCF file 6572 self.export_variant_vcf( 6573 vcf_file=tmp_vcf_name, 6574 remove_info=True, 6575 add_samples=True, 6576 index=False, 6577 where_clause=where_clause, 6578 ) 6579 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6580 if any(value for value in splice_config.values() if value is None): 6581 log.warning("At least one splice config parameter is empty") 6582 # exit annotation_splice 6583 return None 6584 6585 # Params in splice nf 6586 def check_values(dico: dict): 6587 """ 6588 Ensure parameters for NF splice pipeline 6589 """ 6590 for key, val in dico.items(): 6591 if key == "genome": 6592 if any( 6593 assemb in options.get("genome", {}) 6594 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6595 ): 6596 yield f"--{key} hg19" 6597 elif any( 6598 assemb in options.get("genome", {}) 6599 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6600 ): 6601 yield f"--{key} hg38" 6602 elif ( 6603 (isinstance(val, str) and val) 6604 or isinstance(val, int) 6605 or isinstance(val, bool) 6606 ): 6607 yield f"--{key} {val}" 6608 6609 # Genome 6610 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6611 options["genome"] = genome 6612 # NF params 6613 nf_params = [] 6614 # Add options 6615 if options: 6616 log.debug(options) 6617 nf_params = list(check_values(options)) 6618 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6619 else: 6620 log.debug("No NF params provided") 6621 # Add threads 6622 if "threads" not in 
options.keys(): 6623 nf_params.append(f"--threads {threads}") 6624 # Genome path 6625 genome_path = find_genome( 6626 config.get("folders", {}) 6627 .get("databases", {}) 6628 .get("genomes", DEFAULT_GENOME_FOLDER), 6629 file=f"{genome}.fa", 6630 ) 6631 # Add genome path 6632 if not genome_path: 6633 raise ValueError( 6634 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6635 ) 6636 else: 6637 log.debug(f"Genome: {genome_path}") 6638 nf_params.append(f"--genome_path {genome_path}") 6639 6640 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6641 """ 6642 Setting up updated databases for SPiP and SpliceAI 6643 """ 6644 6645 try: 6646 6647 # SpliceAI assembly transcriptome 6648 spliceai_assembly = os.path.join( 6649 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6650 options.get("genome"), 6651 "transcriptome", 6652 ) 6653 spip_assembly = options.get("genome") 6654 6655 spip = find( 6656 f"transcriptome_{spip_assembly}.RData", 6657 config.get("folders", {}).get("databases", {}).get("spip", {}), 6658 ) 6659 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6660 log.debug(f"SPiP annotations: {spip}") 6661 log.debug(f"SpliceAI annotations: {spliceai}") 6662 if spip and spliceai: 6663 return [ 6664 f"--spip_transcriptome {spip}", 6665 f"--spliceai_transcriptome {spliceai}", 6666 ] 6667 else: 6668 log.warning( 6669 "Can't find splice databases in configuration, use annotations file from image" 6670 ) 6671 except TypeError: 6672 log.warning( 6673 "Can't find splice databases in configuration, use annotations file from image" 6674 ) 6675 return [] 6676 6677 # Add options, check if transcriptome option have already beend provided 6678 if ( 6679 "spip_transcriptome" not in nf_params 6680 and "spliceai_transcriptome" not in nf_params 6681 ): 6682 splice_reference = splice_annotations(options, config) 6683 if splice_reference: 6684 
nf_params.extend(splice_reference) 6685 # nf_params.append(f"--output_folder {output_folder}") 6686 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6687 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6688 log.debug(cmd) 6689 splice_config["docker"]["command"] = cmd 6690 6691 # Ensure proxy is set 6692 proxy = [ 6693 f"-e {var}={os.getenv(var)}" 6694 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6695 if os.getenv(var) is not None 6696 ] 6697 docker_cmd = get_bin_command( 6698 tool="splice", 6699 bin_type="docker", 6700 config=config, 6701 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6702 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6703 ) 6704 # print(docker_cmd) 6705 # exit() 6706 # Docker debug 6707 # if splice_config.get("rm_container"): 6708 # rm_container = "--rm" 6709 # else: 6710 # rm_container = "" 6711 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6712 log.debug(docker_cmd) 6713 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6714 log.debug(res.stdout) 6715 if res.stderr: 6716 log.error(res.stderr) 6717 res.check_returncode() 6718 # Update variants 6719 log.info("Annotation - Updating...") 6720 # Test find output vcf 6721 log.debug( 6722 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6723 ) 6724 output_vcf = [] 6725 # Wrong folder to look in 6726 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6727 if ( 6728 files 6729 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6730 ): 6731 
output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6732 # log.debug(os.listdir(options.get("output_folder"))) 6733 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6734 if not output_vcf: 6735 log.debug( 6736 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6737 ) 6738 else: 6739 # Get new header from annotated vcf 6740 log.debug(f"Initial header: {len(header.infos)} fields") 6741 # Create new header with splice infos 6742 new_vcf = Variants(input=output_vcf[0]) 6743 new_vcf_header = new_vcf.get_header().infos 6744 for keys, infos in new_vcf_header.items(): 6745 if keys not in header.infos.keys(): 6746 header.infos[keys] = infos 6747 log.debug(f"New header: {len(header.infos)} fields") 6748 log.debug(f"Splice tmp output: {output_vcf[0]}") 6749 self.update_from_vcf(output_vcf[0]) 6750 6751 # Remove file 6752 remove_if_exists(output_vcf) 6753 6754 ### 6755 # Prioritization 6756 ### 6757 6758 def get_config_default(self, name: str) -> dict: 6759 """ 6760 The function `get_config_default` returns a dictionary containing default configurations for 6761 various calculations and prioritizations. 6762 6763 :param name: The `get_config_default` function returns a dictionary containing default 6764 configurations for different calculations and prioritizations. The `name` parameter is used to 6765 specify which specific configuration to retrieve from the dictionary 6766 :type name: str 6767 :return: The function `get_config_default` returns a dictionary containing default configuration 6768 settings for different calculations and prioritizations. The specific configuration settings are 6769 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6770 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6771 returned. If there is no match, an empty dictionary is returned. 
6772 """ 6773 6774 config_default = { 6775 "calculations": { 6776 "variant_chr_pos_alt_ref": { 6777 "type": "sql", 6778 "name": "variant_chr_pos_alt_ref", 6779 "description": "Create a variant ID with chromosome, position, alt and ref", 6780 "available": False, 6781 "output_column_name": "variant_chr_pos_alt_ref", 6782 "output_column_type": "String", 6783 "output_column_description": "variant ID with chromosome, position, alt and ref", 6784 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6785 "operation_info": True, 6786 }, 6787 "VARTYPE": { 6788 "type": "sql", 6789 "name": "VARTYPE", 6790 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6791 "available": True, 6792 "table": "variants", 6793 "output_column_name": "VARTYPE", 6794 "output_column_type": "String", 6795 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6796 "operation_query": """ 6797 CASE 6798 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6799 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6800 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6801 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6802 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6803 ELSE 'UNDEFINED' 6804 END 6805 """, 6806 "info_fields": ["SVTYPE"], 6807 "operation_info": True, 6808 }, 6809 "snpeff_hgvs": { 6810 "type": "python", 6811 "name": "snpeff_hgvs", 6812 "description": "HGVS nomenclatures from snpEff annotation", 6813 "available": True, 6814 "function_name": "calculation_extract_snpeff_hgvs", 6815 "function_params": ["snpeff_hgvs", "ANN"], 6816 }, 6817 "snpeff_ann_explode": { 6818 "type": "python", 6819 "name": "snpeff_ann_explode", 6820 "description": "Explode snpEff annotations with uniquify values", 6821 "available": True, 6822 "function_name": "calculation_snpeff_ann_explode", 6823 "function_params": [False, "fields", "snpeff_", "ANN"], 6824 }, 6825 "snpeff_ann_explode_uniquify": { 6826 "type": "python", 6827 
"name": "snpeff_ann_explode_uniquify", 6828 "description": "Explode snpEff annotations", 6829 "available": True, 6830 "function_name": "calculation_snpeff_ann_explode", 6831 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6832 }, 6833 "snpeff_ann_explode_json": { 6834 "type": "python", 6835 "name": "snpeff_ann_explode_json", 6836 "description": "Explode snpEff annotations in JSON format", 6837 "available": True, 6838 "function_name": "calculation_snpeff_ann_explode", 6839 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6840 }, 6841 "NOMEN": { 6842 "type": "python", 6843 "name": "NOMEN", 6844 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6845 "available": True, 6846 "function_name": "calculation_extract_nomen", 6847 "function_params": [], 6848 }, 6849 "RENAME_INFO_FIELDS": { 6850 "type": "python", 6851 "name": "RENAME_INFO_FIELDS", 6852 "description": "Rename or remove INFO/tags", 6853 "available": True, 6854 "function_name": "calculation_rename_info_fields", 6855 "function_params": [], 6856 }, 6857 "FINDBYPIPELINE": { 6858 "type": "python", 6859 "name": "FINDBYPIPELINE", 6860 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6861 "available": True, 6862 "function_name": "calculation_find_by_pipeline", 6863 "function_params": ["findbypipeline"], 6864 }, 6865 "FINDBYSAMPLE": { 6866 "type": "python", 6867 "name": "FINDBYSAMPLE", 6868 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6869 "available": True, 6870 "function_name": "calculation_find_by_pipeline", 6871 "function_params": ["findbysample"], 6872 }, 6873 "GENOTYPECONCORDANCE": { 6874 "type": "python", 6875 "name": "GENOTYPECONCORDANCE", 6876 "description": "Concordance of genotype for multi caller VCF", 6877 "available": True, 6878 "function_name": "calculation_genotype_concordance", 6879 "function_params": [], 6880 }, 6881 
"BARCODE": { 6882 "type": "python", 6883 "name": "BARCODE", 6884 "description": "BARCODE as VaRank tool", 6885 "available": True, 6886 "function_name": "calculation_barcode", 6887 "function_params": [], 6888 }, 6889 "BARCODEFAMILY": { 6890 "type": "python", 6891 "name": "BARCODEFAMILY", 6892 "description": "BARCODEFAMILY as VaRank tool", 6893 "available": True, 6894 "function_name": "calculation_barcode_family", 6895 "function_params": ["BCF"], 6896 }, 6897 "TRIO": { 6898 "type": "python", 6899 "name": "TRIO", 6900 "description": "Inheritance for a trio family", 6901 "available": True, 6902 "function_name": "calculation_trio", 6903 "function_params": [], 6904 }, 6905 "VAF": { 6906 "type": "python", 6907 "name": "VAF", 6908 "description": "Variant Allele Frequency (VAF) harmonization", 6909 "available": True, 6910 "function_name": "calculation_vaf_normalization", 6911 "function_params": [], 6912 }, 6913 "VAF_stats": { 6914 "type": "python", 6915 "name": "VAF_stats", 6916 "description": "Variant Allele Frequency (VAF) statistics", 6917 "available": True, 6918 "function_name": "calculation_genotype_stats", 6919 "function_params": ["VAF"], 6920 }, 6921 "DP_stats": { 6922 "type": "python", 6923 "name": "DP_stats", 6924 "description": "Depth (DP) statistics", 6925 "available": True, 6926 "function_name": "calculation_genotype_stats", 6927 "function_params": ["DP"], 6928 }, 6929 "variant_id": { 6930 "type": "python", 6931 "name": "variant_id", 6932 "description": "Variant ID generated from variant position and type", 6933 "available": True, 6934 "function_name": "calculation_variant_id", 6935 "function_params": [], 6936 }, 6937 "transcripts_json": { 6938 "type": "python", 6939 "name": "transcripts_json", 6940 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6941 "available": True, 6942 "function_name": "calculation_transcripts_annotation", 6943 "function_params": ["transcripts_json", None], 6944 }, 6945 "transcripts_ann": { 6946 
"type": "python", 6947 "name": "transcripts_ann", 6948 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6949 "available": True, 6950 "function_name": "calculation_transcripts_annotation", 6951 "function_params": [None, "transcripts_ann"], 6952 }, 6953 "transcripts_annotations": { 6954 "type": "python", 6955 "name": "transcripts_annotations", 6956 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6957 "available": True, 6958 "function_name": "calculation_transcripts_annotation", 6959 "function_params": [None, None], 6960 }, 6961 "transcripts_prioritization": { 6962 "type": "python", 6963 "name": "transcripts_prioritization", 6964 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6965 "available": True, 6966 "function_name": "calculation_transcripts_prioritization", 6967 "function_params": [], 6968 }, 6969 "transcripts_export": { 6970 "type": "python", 6971 "name": "transcripts_export", 6972 "description": "Export transcripts table/view as a file (using param.json)", 6973 "available": True, 6974 "function_name": "calculation_transcripts_export", 6975 "function_params": [], 6976 }, 6977 }, 6978 "prioritizations": { 6979 "default": { 6980 "ANN2": [ 6981 { 6982 "type": "contains", 6983 "value": "HIGH", 6984 "score": 5, 6985 "flag": "PASS", 6986 "comment": [ 6987 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6988 ], 6989 }, 6990 { 6991 "type": "contains", 6992 "value": "MODERATE", 6993 "score": 3, 6994 "flag": "PASS", 6995 "comment": [ 6996 "A non-disruptive variant that might change protein effectiveness" 6997 ], 6998 }, 6999 { 7000 "type": "contains", 7001 "value": "LOW", 7002 "score": 0, 7003 "flag": "FILTERED", 7004 "comment": [ 7005 "Assumed to be mostly harmless or unlikely to change protein behavior" 7006 ], 7007 
}, 7008 { 7009 "type": "contains", 7010 "value": "MODIFIER", 7011 "score": 0, 7012 "flag": "FILTERED", 7013 "comment": [ 7014 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7015 ], 7016 }, 7017 ], 7018 } 7019 }, 7020 } 7021 7022 return config_default.get(name, None) 7023 7024 def get_config_json( 7025 self, name: str, config_dict: dict = {}, config_file: str = None 7026 ) -> dict: 7027 """ 7028 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7029 default values, a dictionary, and a file. 7030 7031 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7032 the name of the configuration. It is used to identify and retrieve the configuration settings 7033 for a specific component or module 7034 :type name: str 7035 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7036 dictionary that allows you to provide additional configuration settings or overrides. When you 7037 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7038 the key is the configuration setting you want to override or 7039 :type config_dict: dict 7040 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7041 specify the path to a configuration file that contains additional settings. If provided, the 7042 function will read the contents of this file and update the configuration dictionary with the 7043 values found in the file, overriding any existing values with the 7044 :type config_file: str 7045 :return: The function `get_config_json` returns a dictionary containing the configuration 7046 settings. 
7047 """ 7048 7049 # Create with default prioritizations 7050 config_default = self.get_config_default(name=name) 7051 configuration = config_default 7052 # log.debug(f"configuration={configuration}") 7053 7054 # Replace prioritizations from dict 7055 for config in config_dict: 7056 configuration[config] = config_dict[config] 7057 7058 # Replace prioritizations from file 7059 config_file = full_path(config_file) 7060 if config_file: 7061 if os.path.exists(config_file): 7062 with open(config_file) as config_file_content: 7063 config_file_dict = yaml.safe_load(config_file_content) 7064 for config in config_file_dict: 7065 configuration[config] = config_file_dict[config] 7066 else: 7067 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7068 log.error(msg_error) 7069 raise ValueError(msg_error) 7070 7071 return configuration 7072 7073 def prioritization( 7074 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7075 ) -> bool: 7076 """ 7077 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7078 prioritizes variants based on configured profiles and criteria. 7079 7080 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7081 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7082 a table name is provided, the method will prioritize the variants in that specific table 7083 :type table: str 7084 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7085 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7086 provided, the code will use a default prefix value of "PZ" 7087 :type pz_prefix: str 7088 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7089 additional parameters specific to the prioritization process. 
        These parameters can include
        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
        configurations needed for the prioritization of variants in a VCF
        :type pz_param: dict
        :return: A boolean value (True) is being returned from the `prioritization` function.
        """

        # Config
        # NOTE(review): `config` is not referenced again in this method
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param: an explicit `pz_param` wins over the
        # "prioritization" section of the parameters
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (built-in defaults merged with an optional config file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix for generated INFO fields (default "PZ")
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options
        # Profiles and pzfields accept either a list or a comma-separated string
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: extra profiles requested directly in param
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                    log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            # Nothing to do without at least one profile
            log.debug(f"No profile defined")
            return False

        # First requested profile becomes the default one if none was given
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Working columns added to the table for the computation (dropped at the end)
        added_columns = []

        # Create list of PZfields
        # List of PZFields: base fields plus one "<field><sep><profile>" per pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists: only fields absent from the
        # VCF header are (re)computed
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            # NOTE(review): assigned but not used later in this method
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description: VCF header metadata for each generated field
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (default-profile aliases, no suffix)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile (suffixed fields)
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header
            # One working column per missing PZfield, typed by field kind
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    # Flag starts TRUE (PASS) and is AND-ed with each criterion flag
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # PZ fields set
                        # Each entry below is a SQL expression that renders one
                        # "<field>=<value>" fragment for the final INFO update

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Score" in list_of_pzfields
                        ):
                            # Unsuffixed alias from the default profile
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Flag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZClass
                        if (
                            f"{pz_prefix}Class{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )

                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Class" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )
                                """
                            )

                        # PZComment
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Comment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Infos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields into one comma-separated concat() argument
                        # list, joined at render time by ';'
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        criterion_fields_profile = []
                        # Random suffix reduces the risk of view-name collisions
                        # between concurrent runs (not guaranteed unique)
                        annotation_view_name = (
                            "annotation_view_for_prioritization_"
                            + str(random.randrange(1000))
                        )
                        annotation_view_prefix = ""
                        for annotation in prioritizations_config[profile]:

                            # skip special sections
                            if annotation.startswith("_"):
                                continue

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:

                                # Criterion mode: "operation" (type/value
                                # comparison) or free-form "sql"
                                criterion_mode = None
                                if np.any(
                                    np.isin(list(criterion.keys()), ["type", "value"])
                                ):
                                    criterion_mode = "operation"
                                elif np.any(
                                    np.isin(list(criterion.keys()), ["sql", "fields"])
                                ):
                                    criterion_mode = "sql"
                                log.debug(f"Criterion Mode: {criterion_mode}")

                                # Criterion parameters
                                criterion_type = criterion.get("type", None)
                                criterion_value = criterion.get("value", None)
                                criterion_sql = criterion.get("sql", None)
                                criterion_fields = criterion.get("fields", None)
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_class = criterion.get("class", None)
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Escape quotes and strip INFO-reserved characters
                                # (';' separates INFO fields, tabs separate VCF columns)
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                # SQL
                                if criterion_sql is not None and isinstance(
                                    criterion_sql, list
                                ):
                                    criterion_sql = " ".join(criterion_sql)

                                # Fields and explode: default field is the
                                # annotation name itself
                                if criterion_fields is None:
                                    criterion_fields = [annotation]
                                if not isinstance(criterion_fields, list):
                                    criterion_fields = str(criterion_fields).split(",")

                                # Class
                                if criterion_class is not None and not isinstance(
                                    criterion_class, list
                                ):
                                    criterion_class = str(criterion_class).split(",")

                                # Add criterion fields to the list of profile's criteria
                                criterion_fields_profile = list(
                                    set(criterion_fields_profile + criterion_fields)
                                )

                                sql_set = []
                                sql_set_info = []

                                # PZ fields set
                                # SQL SET assignments applied when this criterion matches

                                # PZScore
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # VaRank prioritization score mode: keep the max score
                                    if prioritization_score_mode.upper().strip() in [
                                        "VARANK",
                                        "MAX",
                                        "MAXIMUM",
                                        "TOP",
                                    ]:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
                                        )
                                    # default HOWARD prioritization score mode: sum scores
                                    else:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )

                                # PZFlag: one FILTERED criterion filters the variant
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )

                                # PZClass
                                if (
                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                    and criterion_class is not None
                                ):
                                    sql_set.append(
                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
                                    )

                                # PZComment
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Comment{pzfields_sep}{profile} =
                                        concat(
                                            {pz_prefix}Comment{pzfields_sep}{profile},
                                            CASE
                                                WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                THEN ', '
                                                ELSE ''
                                            END,
                                            '{criterion_comment}'
                                        )
                                        """
                                    )

                                # PZInfos
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Infos{pzfields_sep}{profile} =
                                        concat(
                                            {pz_prefix}Infos{pzfields_sep}{profile},
                                            '{criterion_infos}'
                                        )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison
                                if sql_set_option:

                                    # Operation mode
                                    if criterion_mode in ["operation"]:

                                        # Check if value is a float: numeric values
                                        # compare with the mapped operator
                                        try:
                                            float(criterion_value)
                                            sql_update = f"""
                                                UPDATE "{table_variants}"
                                                SET {sql_set_option}
                                                FROM (
                                                    SELECT *
                                                    FROM "{annotation_view_name}"
                                                    WHERE (
                                                        CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                                        AND CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                                    )
                                                ) AS "{annotation_view_name}"
                                                WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
                                                AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
                                                AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
                                                AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT"

                                            """
                                        # If not a float: string match (SIMILAR TO)
                                        # NOTE(review): bare except also hides
                                        # TypeError when criterion_value is None;
                                        # consider except (TypeError, ValueError)
                                        except:
                                            contains_option = ""
                                            if criterion_type == "contains":
                                                contains_option = ".*"
                                            sql_update = f"""
                                                UPDATE "{table_variants}"
                                                SET {sql_set_option}
                                                FROM (
                                                    SELECT *
                                                    FROM "{annotation_view_name}"
                                                    WHERE (
                                                        "{annotation_view_name}"."{annotation_view_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                                    )
                                                ) AS "{annotation_view_name}"
                                                WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
                                                AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
                                                AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
                                                AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT"

                                            """
                                        sql_queries.append(sql_update)

                                    # SQL mode: the criterion provides its own WHERE clause
                                    elif criterion_mode in ["sql"]:

                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            FROM (
                                                SELECT *
                                                FROM "{annotation_view_name}"
                                                WHERE ({criterion_sql})
                                            ) AS "{annotation_view_name}"
                                            WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
                                            AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
                                            AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
                                            AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT"
                                        """
                                        sql_queries.append(sql_update)

                                    else:
                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
                                        log.error(msg_err)
                                        raise ValueError(msg_err)

                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                        # PZTags: summary "field#value" pairs appended to INFO
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Create PZTags value
                            pztags_value = ""
                            pztags_sep_default = ","
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        elif pzfield in [f"{pz_prefix}Class"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                    ELSE '.'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZTags
                            # "WHERE 1=1" lets the per-chromosome loop append
                            # an "AND ..." filter safely
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                )
                                WHERE 1=1
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZTags for default profile (unsuffixed)
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                    WHERE 1=1
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        # Create annotations view for prioritization: exposes the
                        # INFO fields used by this profile's criteria as columns
                        log.debug(
                            f"""Profile '{profile}' - Prioritization - Create '{annotation_view_name}' view with '{criterion_fields_profile}'... """
                        )
                        annotation_view = self.create_annotations_view(
                            view=annotation_view_name,
                            prefix=annotation_view_prefix,
                            fields=criterion_fields_profile,
                            drop_view=True,
                        )

                        # Chromosomes list
                        sql_uniq_chrom = f"""
                            SELECT DISTINCT "#CHROM"
                            FROM {table_variants}
                        """
                        chroms = self.get_query_to_df(sql_uniq_chrom)["#CHROM"].tolist()

                        # Run the accumulated UPDATE queries chromosome by
                        # chromosome — presumably to bound per-query work; confirm
                        for chrom in chroms:

                            log.debug(
                                f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}'... """
                            )

                            if sql_queries:

                                # Query num
                                num_query = 0

                                # For each query
                                for sql_query in sql_queries:

                                    # Query num
                                    num_query += 1

                                    # Every query ends with a WHERE clause, so an
                                    # "AND" chromosome filter can be appended
                                    sql_query_chrom = f"""
                                        {sql_query}
                                        AND {table_variants}."#CHROM" LIKE '{chrom}'
                                    """
                                    log.debug(
                                        f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}' [{num_query}/{len(sql_queries)}]"""
                                    )
                                    # log.debug(f"""sql_query_chrom: {sql_query_chrom}""")
                                    self.execute_query(query=sql_query_chrom)

                        # Update INFO field: render the computed working columns
                        # into the INFO string ("field=value" fragments joined by ';')
                        log.info(f"""Profile '{profile}' - Update... """)
                        sql_query_update = f"""
                            UPDATE {table_variants}
                            SET INFO =
                                concat(
                                    CASE
                                        WHEN INFO NOT IN ('','.')
                                        THEN concat(INFO, ';')
                                        ELSE ''
                                    END
                                    {sql_set_info_option}
                                )
                        """
                        # log.debug(f"sql_query_update={sql_query_update}")
                        self.execute_query(query=sql_query_update)

                        # Remove annotations view for prioritization
                        query_drop_tmp_table = f"""
                            DROP VIEW IF EXISTS {annotation_view_name}
                        """
                        self.execute_query(query=query_drop_tmp_table)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (working columns are temporary)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True

    ###
    # HGVS
    ###

    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing.
If no value is provided, it will default to the number 7892 of threads obtained from the `get_threads()` method 7893 :type threads: int 7894 """ 7895 7896 # Function for each partition of the Dask Dataframe 7897 def partition_function(partition): 7898 """ 7899 The function `partition_function` applies the `annotation_hgvs_partition` function to 7900 each row of a DataFrame called `partition`. 7901 7902 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7903 to be processed 7904 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7905 the "partition" dataframe along the axis 1. 7906 """ 7907 return partition.apply(annotation_hgvs_partition, axis=1) 7908 7909 def annotation_hgvs_partition(row) -> str: 7910 """ 7911 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7912 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7913 7914 :param row: A dictionary-like object that contains the values for the following keys: 7915 :return: a string that contains the HGVS names associated with the given row of data. 
7916 """ 7917 7918 chr = row["CHROM"] 7919 pos = row["POS"] 7920 ref = row["REF"] 7921 alt = row["ALT"] 7922 7923 # Find list of associated transcripts 7924 transcripts_list = list( 7925 polars_conn.execute( 7926 f""" 7927 SELECT transcript 7928 FROM refseq_df 7929 WHERE CHROM='{chr}' 7930 AND POS={pos} 7931 """ 7932 )["transcript"] 7933 ) 7934 7935 # Full HGVS annotation in list 7936 hgvs_full_list = [] 7937 7938 for transcript_name in transcripts_list: 7939 7940 # Transcript 7941 transcript = get_transcript( 7942 transcripts=transcripts, transcript_name=transcript_name 7943 ) 7944 # Exon 7945 if use_exon: 7946 exon = transcript.find_exon_number(pos) 7947 else: 7948 exon = None 7949 # Protein 7950 transcript_protein = None 7951 if use_protein or add_protein or full_format: 7952 transcripts_protein = list( 7953 polars_conn.execute( 7954 f""" 7955 SELECT protein 7956 FROM refseqlink_df 7957 WHERE transcript='{transcript_name}' 7958 LIMIT 1 7959 """ 7960 )["protein"] 7961 ) 7962 if len(transcripts_protein): 7963 transcript_protein = transcripts_protein[0] 7964 7965 # HGVS name 7966 hgvs_name = format_hgvs_name( 7967 chr, 7968 pos, 7969 ref, 7970 alt, 7971 genome=genome, 7972 transcript=transcript, 7973 transcript_protein=transcript_protein, 7974 exon=exon, 7975 use_gene=use_gene, 7976 use_protein=use_protein, 7977 full_format=full_format, 7978 use_version=use_version, 7979 codon_type=codon_type, 7980 ) 7981 hgvs_full_list.append(hgvs_name) 7982 if add_protein and not use_protein and not full_format: 7983 hgvs_name = format_hgvs_name( 7984 chr, 7985 pos, 7986 ref, 7987 alt, 7988 genome=genome, 7989 transcript=transcript, 7990 transcript_protein=transcript_protein, 7991 exon=exon, 7992 use_gene=use_gene, 7993 use_protein=True, 7994 full_format=False, 7995 use_version=use_version, 7996 codon_type=codon_type, 7997 ) 7998 hgvs_full_list.append(hgvs_name) 7999 8000 # Create liste of HGVS annotations 8001 hgvs_full = ",".join(hgvs_full_list) 8002 8003 return hgvs_full 8004 
8005 # Polars connexion 8006 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8007 8008 # Config 8009 config = self.get_config() 8010 8011 # Databases 8012 # Genome 8013 databases_genomes_folders = ( 8014 config.get("folders", {}) 8015 .get("databases", {}) 8016 .get("genomes", DEFAULT_GENOME_FOLDER) 8017 ) 8018 databases_genome = ( 8019 config.get("folders", {}).get("databases", {}).get("genomes", "") 8020 ) 8021 # refseq database folder 8022 databases_refseq_folders = ( 8023 config.get("folders", {}) 8024 .get("databases", {}) 8025 .get("refseq", DEFAULT_REFSEQ_FOLDER) 8026 ) 8027 # refseq 8028 databases_refseq = config.get("databases", {}).get("refSeq", None) 8029 # refSeqLink 8030 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 8031 8032 # Param 8033 param = self.get_param() 8034 8035 # Quick HGVS 8036 if "hgvs_options" in param and param.get("hgvs_options", ""): 8037 log.info(f"Quick HGVS Annotation:") 8038 if not param.get("hgvs", None): 8039 param["hgvs"] = {} 8040 for option in param.get("hgvs_options", "").split(","): 8041 option_var_val = option.split("=") 8042 option_var = option_var_val[0] 8043 if len(option_var_val) > 1: 8044 option_val = option_var_val[1] 8045 else: 8046 option_val = "True" 8047 if option_val.upper() in ["TRUE"]: 8048 option_val = True 8049 elif option_val.upper() in ["FALSE"]: 8050 option_val = False 8051 log.info(f" {option_var}={option_val}") 8052 param["hgvs"][option_var] = option_val 8053 8054 # Check if HGVS annotation enabled 8055 if "hgvs" in param: 8056 log.info(f"HGVS Annotation... 
") 8057 for hgvs_option in param.get("hgvs", {}): 8058 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 8059 else: 8060 return 8061 8062 # HGVS Param 8063 param_hgvs = param.get("hgvs", {}) 8064 use_exon = param_hgvs.get("use_exon", False) 8065 use_gene = param_hgvs.get("use_gene", False) 8066 use_protein = param_hgvs.get("use_protein", False) 8067 add_protein = param_hgvs.get("add_protein", False) 8068 full_format = param_hgvs.get("full_format", False) 8069 use_version = param_hgvs.get("use_version", False) 8070 codon_type = param_hgvs.get("codon_type", "3") 8071 8072 # refSseq refSeqLink 8073 databases_refseq = param_hgvs.get("refseq", databases_refseq) 8074 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 8075 8076 # Assembly 8077 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 8078 8079 # Genome 8080 genome_file = None 8081 if find_genome(databases_genome): 8082 genome_file = find_genome(databases_genome) 8083 else: 8084 genome_file = find_genome( 8085 genome_path=databases_genomes_folders, assembly=assembly 8086 ) 8087 log.debug("Genome: " + str(genome_file)) 8088 8089 # refSseq 8090 refseq_file = find_file_prefix( 8091 input_file=databases_refseq, 8092 prefix="ncbiRefSeq", 8093 folder=databases_refseq_folders, 8094 assembly=assembly, 8095 ) 8096 log.debug("refSeq: " + str(refseq_file)) 8097 8098 # refSeqLink 8099 refseqlink_file = find_file_prefix( 8100 input_file=databases_refseqlink, 8101 prefix="ncbiRefSeqLink", 8102 folder=databases_refseq_folders, 8103 assembly=assembly, 8104 ) 8105 log.debug("refSeqLink: " + str(refseqlink_file)) 8106 8107 # Threads 8108 if not threads: 8109 threads = self.get_threads() 8110 log.debug("Threads: " + str(threads)) 8111 8112 # Variables 8113 table_variants = self.get_table_variants(clause="update") 8114 8115 # Get variants SNV and InDel only 8116 query_variants = f""" 8117 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8118 FROM {table_variants} 8119 WHERE REF 
~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8120 """ 8121 df_variants = self.get_query_to_df(query_variants) 8122 8123 # Added columns 8124 added_columns = [] 8125 8126 # Add hgvs column in variants table 8127 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8128 added_column = self.add_column( 8129 table_variants, hgvs_column_name, "STRING", default_value=None 8130 ) 8131 added_columns.append(added_column) 8132 8133 log.debug(f"refSeq loading...") 8134 # refSeq in duckDB 8135 refseq_table = get_refseq_table( 8136 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8137 ) 8138 # Loading all refSeq in Dataframe 8139 refseq_query = f""" 8140 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8141 FROM {refseq_table} 8142 JOIN df_variants ON ( 8143 {refseq_table}.chrom = df_variants.CHROM 8144 AND {refseq_table}.txStart<=df_variants.POS 8145 AND {refseq_table}.txEnd>=df_variants.POS 8146 ) 8147 """ 8148 refseq_df = self.conn.query(refseq_query).pl() 8149 8150 if refseqlink_file: 8151 log.debug(f"refSeqLink loading...") 8152 # refSeqLink in duckDB 8153 refseqlink_table = get_refseq_table( 8154 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8155 ) 8156 # Loading all refSeqLink in Dataframe 8157 protacc_column = "protAcc_with_ver" 8158 mrnaacc_column = "mrnaAcc_with_ver" 8159 refseqlink_query = f""" 8160 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8161 FROM {refseqlink_table} 8162 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8163 WHERE protAcc_without_ver IS NOT NULL 8164 """ 8165 # Polars Dataframe 8166 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8167 8168 # Read RefSeq transcripts into a python dict/model. 
8169 log.debug(f"Transcripts loading...") 8170 with tempfile.TemporaryDirectory() as tmpdir: 8171 transcripts_query = f""" 8172 COPY ( 8173 SELECT {refseq_table}.* 8174 FROM {refseq_table} 8175 JOIN df_variants ON ( 8176 {refseq_table}.chrom=df_variants.CHROM 8177 AND {refseq_table}.txStart<=df_variants.POS 8178 AND {refseq_table}.txEnd>=df_variants.POS 8179 ) 8180 ) 8181 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8182 """ 8183 self.conn.query(transcripts_query) 8184 with open(f"{tmpdir}/transcript.tsv") as infile: 8185 transcripts = read_transcripts(infile) 8186 8187 # Polars connexion 8188 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8189 8190 log.debug("Genome loading...") 8191 # Read genome sequence using pyfaidx. 8192 genome = Fasta(genome_file) 8193 8194 log.debug("Start annotation HGVS...") 8195 8196 # Create 8197 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8198 ddf = dd.from_pandas(df_variants, npartitions=threads) 8199 8200 # Use dask.dataframe.apply() to apply function on each partition 8201 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8202 8203 # Convert Dask DataFrame to Pandas Dataframe 8204 df = ddf.compute() 8205 8206 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
8207 with tempfile.TemporaryDirectory() as tmpdir: 8208 df_parquet = os.path.join(tmpdir, "df.parquet") 8209 df.to_parquet(df_parquet) 8210 8211 # Update hgvs column 8212 update_variant_query = f""" 8213 UPDATE {table_variants} 8214 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 8215 FROM read_parquet('{df_parquet}') as df 8216 WHERE variants."#CHROM" = df.CHROM 8217 AND variants.POS = df.POS 8218 AND variants.REF = df.REF 8219 AND variants.ALT = df.ALT 8220 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 8221 """ 8222 self.execute_query(update_variant_query) 8223 8224 # Update INFO column 8225 sql_query_update = f""" 8226 UPDATE {table_variants} 8227 SET INFO = 8228 concat( 8229 CASE 8230 WHEN INFO NOT IN ('','.') 8231 THEN concat(INFO, ';') 8232 ELSE '' 8233 END, 8234 'hgvs=', 8235 {hgvs_column_name} 8236 ) 8237 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 8238 """ 8239 self.execute_query(sql_query_update) 8240 8241 # Add header 8242 HGVS_INFOS = { 8243 "hgvs": { 8244 "ID": "hgvs", 8245 "Number": ".", 8246 "Type": "String", 8247 "Description": f"HGVS annotatation with HOWARD", 8248 } 8249 } 8250 8251 for field in HGVS_INFOS: 8252 field_ID = HGVS_INFOS[field]["ID"] 8253 field_description = HGVS_INFOS[field]["Description"] 8254 self.get_header().infos[field_ID] = vcf.parser._Info( 8255 field_ID, 8256 HGVS_INFOS[field]["Number"], 8257 HGVS_INFOS[field]["Type"], 8258 field_description, 8259 "unknown", 8260 "unknown", 8261 code_type_map[HGVS_INFOS[field]["Type"]], 8262 ) 8263 8264 # Remove added columns 8265 for added_column in added_columns: 8266 self.drop_column(column=added_column) 8267 8268 ### 8269 # Calculation 8270 ### 8271 8272 def get_operations_help( 8273 self, operations_config_dict: dict = {}, operations_config_file: str = None 8274 ) -> list: 8275 8276 # Init 8277 operations_help = [] 8278 8279 # operations 8280 operations = self.get_config_json( 8281 name="calculations", 8282 
config_dict=operations_config_dict, 8283 config_file=operations_config_file, 8284 ) 8285 for op in operations: 8286 op_name = operations[op].get("name", op).upper() 8287 op_description = operations[op].get("description", op_name) 8288 op_available = operations[op].get("available", False) 8289 if op_available: 8290 operations_help.append(f" {op_name}: {op_description}") 8291 8292 # Sort operations 8293 operations_help.sort() 8294 8295 # insert header 8296 operations_help.insert(0, "Available calculation operations:") 8297 8298 # Return 8299 return operations_help 8300 8301 def calculation( 8302 self, 8303 operations: dict = {}, 8304 operations_config_dict: dict = {}, 8305 operations_config_file: str = None, 8306 ) -> None: 8307 """ 8308 It takes a list of operations, and for each operation, it checks if it's a python or sql 8309 operation, and then calls the appropriate function 8310 8311 param json example: 8312 "calculation": { 8313 "NOMEN": { 8314 "options": { 8315 "hgvs_field": "hgvs" 8316 }, 8317 "middle" : null 8318 } 8319 """ 8320 8321 # Param 8322 param = self.get_param() 8323 8324 # CHeck operations config file 8325 if operations_config_file is None: 8326 operations_config_file = param.get("calculation", {}).get( 8327 "calculation_config", None 8328 ) 8329 8330 # operations config 8331 operations_config = self.get_config_json( 8332 name="calculations", 8333 config_dict=operations_config_dict, 8334 config_file=operations_config_file, 8335 ) 8336 8337 # Upper keys 8338 operations_config = {k.upper(): v for k, v in operations_config.items()} 8339 8340 # Calculations 8341 8342 # Operations from param 8343 operations = param.get("calculation", {}).get("calculations", operations) 8344 8345 # Quick calculation - add 8346 if param.get("calculations", None): 8347 8348 # List of operations 8349 calculations_list = [ 8350 value.strip() for value in param.get("calculations", "").split(",") 8351 ] 8352 8353 # Log 8354 log.info(f"Quick Calculations:") 8355 for 
calculation_key in calculations_list: 8356 log.info(f" {calculation_key}") 8357 8358 # Create tmp operations (to keep operation order) 8359 operations_tmp = {} 8360 for calculation_operation in calculations_list: 8361 if calculation_operation.upper() not in operations_tmp: 8362 log.debug( 8363 f"{calculation_operation}.upper() not in {operations_tmp}" 8364 ) 8365 operations_tmp[calculation_operation.upper()] = {} 8366 add_value_into_dict( 8367 dict_tree=operations_tmp, 8368 sections=[ 8369 calculation_operation.upper(), 8370 ], 8371 value=operations.get(calculation_operation.upper(), {}), 8372 ) 8373 # Add operations already in param 8374 for calculation_operation in operations: 8375 if calculation_operation not in operations_tmp: 8376 operations_tmp[calculation_operation] = operations.get( 8377 calculation_operation, {} 8378 ) 8379 8380 # Update operations in param 8381 operations = operations_tmp 8382 8383 # Operations for calculation 8384 if not operations: 8385 operations = param.get("calculation", {}).get("calculations", {}) 8386 8387 if operations: 8388 log.info(f"Calculations...") 8389 8390 # For each operations 8391 for operation_name in operations: 8392 operation_name = operation_name.upper() 8393 if operation_name not in [""]: 8394 if operation_name in operations_config: 8395 log.info(f"Calculation '{operation_name}'") 8396 operation = operations_config[operation_name] 8397 operation_type = operation.get("type", "sql") 8398 if operation_type == "python": 8399 self.calculation_process_function( 8400 operation=operation, operation_name=operation_name 8401 ) 8402 elif operation_type == "sql": 8403 self.calculation_process_sql( 8404 operation=operation, operation_name=operation_name 8405 ) 8406 else: 8407 log.error( 8408 f"Operations config: Type '{operation_type}' NOT available" 8409 ) 8410 raise ValueError( 8411 f"Operations config: Type '{operation_type}' NOT available" 8412 ) 8413 else: 8414 log.error( 8415 f"Operations config: Calculation 
'{operation_name}' NOT available" 8416 ) 8417 raise ValueError( 8418 f"Operations config: Calculation '{operation_name}' NOT available" 8419 ) 8420 8421 # Explode INFOS fields into table fields 8422 if self.get_explode_infos(): 8423 self.explode_infos( 8424 prefix=self.get_explode_infos_prefix(), 8425 fields=self.get_explode_infos_fields(), 8426 force=True, 8427 ) 8428 8429 def calculation_process_sql( 8430 self, operation: dict, operation_name: str = "unknown" 8431 ) -> None: 8432 """ 8433 The `calculation_process_sql` function takes in a mathematical operation as a string and 8434 performs the operation, updating the specified table with the result. 8435 8436 :param operation: The `operation` parameter is a dictionary that contains information about the 8437 mathematical operation to be performed. It includes the following keys: 8438 :type operation: dict 8439 :param operation_name: The `operation_name` parameter is a string that represents the name of 8440 the mathematical operation being performed. 
It is used for logging and error handling purposes, 8441 defaults to unknown 8442 :type operation_name: str (optional) 8443 """ 8444 8445 # Operation infos 8446 operation_name = operation.get("name", "unknown") 8447 log.debug(f"process SQL {operation_name}") 8448 output_column_name = operation.get("output_column_name", operation_name) 8449 output_column_type = operation.get("output_column_type", "String") 8450 prefix = operation.get("explode_infos_prefix", "") 8451 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8452 output_column_description = operation.get( 8453 "output_column_description", f"{operation_name} operation" 8454 ) 8455 operation_query = operation.get("operation_query", None) 8456 if isinstance(operation_query, list): 8457 operation_query = " ".join(operation_query) 8458 operation_info_fields = operation.get("info_fields", []) 8459 operation_info_fields_check = operation.get("info_fields_check", False) 8460 operation_info = operation.get("operation_info", True) 8461 operation_table = operation.get( 8462 "table", self.get_table_variants(clause="alter") 8463 ) 8464 8465 # table variants 8466 if operation_table: 8467 table_variants = operation_table 8468 else: 8469 table_variants = self.get_table_variants(clause="alter") 8470 8471 if operation_query: 8472 8473 # Info fields check 8474 operation_info_fields_check_result = True 8475 if operation_info_fields_check: 8476 header_infos = self.get_header().infos 8477 for info_field in operation_info_fields: 8478 operation_info_fields_check_result = ( 8479 operation_info_fields_check_result 8480 and info_field in header_infos 8481 ) 8482 8483 # If info fields available 8484 if operation_info_fields_check_result: 8485 8486 # Added_columns 8487 added_columns = [] 8488 8489 # Create VCF header field 8490 vcf_reader = self.get_header() 8491 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8492 output_column_name, 8493 ".", 8494 output_column_type, 8495 
output_column_description, 8496 "howard calculation", 8497 "0", 8498 self.code_type_map.get(output_column_type), 8499 ) 8500 8501 # Explode infos if needed 8502 log.debug(f"calculation_process_sql prefix {prefix}") 8503 added_columns += self.explode_infos( 8504 prefix=prefix, 8505 fields=[output_column_name] + operation_info_fields, 8506 force=False, 8507 table=table_variants, 8508 ) 8509 8510 # Create column 8511 added_column = self.add_column( 8512 table_name=table_variants, 8513 column_name=prefix + output_column_name, 8514 column_type=output_column_type_sql, 8515 default_value="null", 8516 ) 8517 added_columns.append(added_column) 8518 8519 # Operation calculation 8520 try: 8521 8522 # Query to update calculation column 8523 sql_update = f""" 8524 UPDATE {table_variants} 8525 SET "{prefix}{output_column_name}" = ({operation_query}) 8526 """ 8527 self.conn.execute(sql_update) 8528 8529 # Add to INFO 8530 if operation_info: 8531 sql_update_info = f""" 8532 UPDATE {table_variants} 8533 SET "INFO" = 8534 concat( 8535 CASE 8536 WHEN "INFO" IS NOT NULL 8537 THEN concat("INFO", ';') 8538 ELSE '' 8539 END, 8540 '{output_column_name}=', 8541 "{prefix}{output_column_name}" 8542 ) 8543 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8544 """ 8545 self.conn.execute(sql_update_info) 8546 8547 except: 8548 log.error( 8549 f"Operations config: Calculation '{operation_name}' query failed" 8550 ) 8551 raise ValueError( 8552 f"Operations config: Calculation '{operation_name}' query failed" 8553 ) 8554 8555 # Remove added columns 8556 for added_column in added_columns: 8557 log.debug(f"added_column: {added_column}") 8558 self.drop_column(column=added_column) 8559 8560 else: 8561 log.error( 8562 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8563 ) 8564 raise ValueError( 8565 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8566 ) 8567 8568 else: 8569 log.error( 8570 f"Operations config: Calculation '{operation_name}' query NOT defined" 8571 ) 8572 raise ValueError( 8573 f"Operations config: Calculation '{operation_name}' query NOT defined" 8574 ) 8575 8576 def calculation_process_function( 8577 self, operation: dict, operation_name: str = "unknown" 8578 ) -> None: 8579 """ 8580 The `calculation_process_function` takes in an operation dictionary and performs the specified 8581 function with the given parameters. 8582 8583 :param operation: The `operation` parameter is a dictionary that contains information about the 8584 operation to be performed. It has the following keys: 8585 :type operation: dict 8586 :param operation_name: The `operation_name` parameter is a string that represents the name of 8587 the operation being performed. It is used for logging purposes, defaults to unknown 8588 :type operation_name: str (optional) 8589 """ 8590 8591 operation_name = operation["name"] 8592 log.debug(f"process Python {operation_name}") 8593 function_name = operation["function_name"] 8594 function_params = operation["function_params"] 8595 getattr(self, function_name)(*function_params) 8596 8597 def calculation_variant_id(self) -> None: 8598 """ 8599 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8600 updates the INFO field of a variants table with the variant ID. 
    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.

        The variant ID values come from the column named by `get_variant_id_column`
        (presumably created by that call if missing — TODO confirm); the column is
        dropped again once its content has been appended to INFO as
        '<variant_id_tag>=<value>'.
        """

        # variant_id annotation field (column name, also used as the INFO tag)
        variant_id_tag = self.get_variant_id_column()
        added_columns = [variant_id_tag]

        # variant_id tag description (for the VCF header)
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id INFO field to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<tag>=<value>' to INFO, prefixing with ';' only when INFO
        # already holds annotations (NULL, '' and '.' count as empty)
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns (drop the variant_id working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new INFO tag in the variants table.

        The SnpEff sub-field layout is parsed from the header description of
        `snpeff_field` (the quoted, pipe-separated list of annotation names). Each
        variant's annotation string is passed to `extract_snpeff_hgvs`, and the result
        is appended to the INFO column as '<snpeff_hgvs>=<value>'. If `snpeff_field`
        is absent from the header, a warning is emitted and nothing is changed.

        :param snpeff_hgvs: name of the INFO tag that will receive the extracted
            HGVS nomenclatures, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: name of the INFO field holding the SnpEff annotations,
            defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description cannot be parsed
        """

        # Snpeff hgvs tag description (for the VCF header)
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — confirm this
        # is intended rather than keeping the configured prefix value
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff column names (exploded ANN input and HGVS output)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the snpEff field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: the sub-field names are the quoted,
            # " | "-separated list inside the header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key, mapped to the original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (used as join key for the update)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe with variant id and the exploded snpEff annotations
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Extract HGVS nomenclatures from each annotation string
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<snpeff_hgvs>=<value>' to INFO for non-empty results; the
            # UPDATE joins against the local Pandas dataframe (presumably resolved
            # by DuckDB's replacement scan on the variable name — TODO confirm)
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe and reclaim memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the annotation field and updating variant information accordingly.

        The SnpEff sub-field layout is parsed from the header description of
        `snpeff_field`; each variant's annotation string is passed to
        `explode_snpeff_ann` and the result is appended to the INFO column (prefixed
        with '<output_prefix>=' in JSON mode). If `snpeff_field` is absent from the
        header, a warning is emitted and nothing is changed.

        :param uniquify: whether the exploded output should be uniquified
            (duplicate entries removed), defaults to True
        :type uniquify: bool (optional)
        :param output_format: format of the generated annotations; "fields" creates
            one header entry per sub-field, "JSON" creates a single JSON-valued tag,
            defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated annotation names (and
            used as tag name in JSON mode), defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: name of the INFO field holding the SnpEff annotations,
            defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description cannot be parsed
        """

        # SnpEff working column/tag name
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff tag description (for the VCF header)
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — confirm this
        # is intended rather than keeping the configured prefix value
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff column names (exploded ANN input and exploded output)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the snpEff field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: sub-field names are the quoted,
            # " | "-separated list inside the header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key, mapped to the original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (used as join key for the update)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe with variant id and the exploded snpEff annotations
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns: explode each annotation string
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: single JSON tag, or one tag per snpEff sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO for non-empty results; the
            # UPDATE joins against the local Pandas dataframe (presumably resolved
            # by DuckDB's replacement scan on the variable name — TODO confirm)
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe and reclaim memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        The reference nomenclature (NOMEN) and its components (CNOMEN, PNOMEN, ...)
        are computed by `find_nomen` from the exploded HGVS field, using an optional
        transcript preference list (from a file and/or a table column, prioritized
        by the "transcripts_order" option), and appended to the INFO column as
        ';TAG=value' pairs.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Column name holding the per-variant NOMEN structure in the dataframe
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: INFO tag -> header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads (retrieved but not used in the visible implementation)
        threads = self.get_threads()

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Added columns (working columns to drop at the end)
        added_columns = []

        # Get HGVS field (option "calculation.calculations.NOMEN.options.hgvs_field")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources
        transcripts_sources = {}

        # Get transcripts preference file (first column = transcript names)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        # Per-variant transcript source: a table column when configured, else NULL
        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(
                fields=[transcripts_column], table=transcripts_table
            )
        else:
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe with variant keys, hgvs and transcript values
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Transcripts rank: 1-based preference order from the file list
            transcripts_rank = {
                transcript: rank for rank, transcript in enumerate(transcripts, start=1)
            }
            transcripts_len = len(transcripts_rank)

            # Create main NOMEN column: one NOMEN structure per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len,
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Add field to SQL query update: emit ';TAG=value' only for
                # non-empty components of the NOMEN structure
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update (each CASE is a separate concat argument)
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the NOMEN tags to INFO, joining on the variant key columns;
            # the UPDATE reads the local Pandas dataframe (presumably via DuckDB's
            # replacement scan on the variable name — TODO confirm)
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe and reclaim memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        The calculation only runs when the VCF has genotypes, i.e. a FORMAT column and at least
        one sample. The per-variant result is computed in pandas (via the `findbypipeline`
        helper) and then joined back into the variants table with a single SQL UPDATE,
        appending a '<tag>=<value>' entry to the INFO column.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # Only applicable if genotypes are present (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (used as the join key for the UPDATE below);
            # it is dropped again at the end of the method
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe with only the columns needed for the calculation
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (one value per variant row)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add the findbypipeline tag to the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO by joining the in-memory dataframe (DuckDB can reference a
            # Python dataframe by name in UPDATE ... FROM).
            # NOTE(review): '<expr> NOT NULL' is used as a postfix null test — presumably
            # DuckDB-specific shorthand for 'IS NOT NULL'; confirm against DuckDB docs.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe and reclaim memory before the next calculation
            del dataframe_findbypipeline
            gc.collect()
('','.') 9249 THEN '' 9250 ELSE concat("INFO", ';') 9251 END, 9252 CASE 9253 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9254 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9255 THEN concat( 9256 '{findbypipeline_tag}=', 9257 dataframe_findbypipeline."{findbypipeline_infos}" 9258 ) 9259 ELSE '' 9260 END 9261 ) 9262 FROM dataframe_findbypipeline 9263 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9264 """ 9265 self.conn.execute(sql_update) 9266 9267 # Remove added columns 9268 for added_column in added_columns: 9269 self.drop_column(column=added_column) 9270 9271 # Delete dataframe 9272 del dataframe_findbypipeline 9273 gc.collect() 9274 9275 def calculation_genotype_concordance(self) -> None: 9276 """ 9277 The function `calculation_genotype_concordance` calculates the genotype concordance for 9278 multi-caller VCF files and updates the variant information in the database. 9279 """ 9280 9281 # if FORMAT and samples 9282 if ( 9283 "FORMAT" in self.get_header_columns_as_list() 9284 and self.get_header_sample_list() 9285 ): 9286 9287 # genotypeconcordance annotation field 9288 genotypeconcordance_tag = "genotypeconcordance" 9289 9290 # VCF infos tags 9291 vcf_infos_tags = { 9292 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9293 } 9294 9295 # Prefix 9296 prefix = self.get_explode_infos_prefix() 9297 9298 # Field 9299 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9300 9301 # Variants table 9302 table_variants = self.get_table_variants() 9303 9304 # Header 9305 vcf_reader = self.get_header() 9306 9307 # Create variant id 9308 variant_id_column = self.get_variant_id_column() 9309 added_columns = [variant_id_column] 9310 9311 # variant_id, FORMAT and samples 9312 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9313 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9314 ) 9315 9316 # Create dataframe 9317 
dataframe_genotypeconcordance = self.get_query_to_df( 9318 f""" SELECT {samples_fields} FROM {table_variants} """ 9319 ) 9320 9321 # Create genotypeconcordance column 9322 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9323 dataframe_genotypeconcordance.apply( 9324 lambda row: genotypeconcordance( 9325 row, samples=self.get_header_sample_list() 9326 ), 9327 axis=1, 9328 ) 9329 ) 9330 9331 # Add genotypeconcordance to header 9332 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9333 genotypeconcordance_tag, 9334 ".", 9335 "String", 9336 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9337 "howard calculation", 9338 "0", 9339 self.code_type_map.get("String"), 9340 ) 9341 9342 # Update 9343 sql_update = f""" 9344 UPDATE variants 9345 SET "INFO" = 9346 concat( 9347 CASE 9348 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9349 THEN '' 9350 ELSE concat("INFO", ';') 9351 END, 9352 CASE 9353 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9354 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9355 THEN concat( 9356 '{genotypeconcordance_tag}=', 9357 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9358 ) 9359 ELSE '' 9360 END 9361 ) 9362 FROM dataframe_genotypeconcordance 9363 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9364 """ 9365 self.conn.execute(sql_update) 9366 9367 # Remove added columns 9368 for added_column in added_columns: 9369 self.drop_column(column=added_column) 9370 9371 # Delete dataframe 9372 del dataframe_genotypeconcordance 9373 gc.collect() 9374 9375 def calculation_barcode(self, tag: str = "barcode") -> None: 9376 """ 9377 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9378 updates the INFO field in the file with the calculated barcode values. 
9379 9380 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9381 name that will be used for the barcode calculation in the VCF file. If no tag name is provided, 9382 the default tag name is set to "barcode", defaults to barcode 9383 :type tag: str (optional) 9384 """ 9385 9386 # if FORMAT and samples 9387 if ( 9388 "FORMAT" in self.get_header_columns_as_list() 9389 and self.get_header_sample_list() 9390 ): 9391 9392 # barcode annotation field 9393 if not tag: 9394 tag = "barcode" 9395 9396 # VCF infos tags 9397 vcf_infos_tags = { 9398 tag: "barcode calculation (VaRank)", 9399 } 9400 9401 # Prefix 9402 prefix = self.get_explode_infos_prefix() 9403 9404 # Field 9405 barcode_infos = prefix + tag 9406 9407 # Variants table 9408 table_variants = self.get_table_variants() 9409 9410 # Header 9411 vcf_reader = self.get_header() 9412 9413 # Create variant id 9414 variant_id_column = self.get_variant_id_column() 9415 added_columns = [variant_id_column] 9416 9417 # variant_id, FORMAT and samples 9418 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9419 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9420 ) 9421 9422 # Create dataframe 9423 dataframe_barcode = self.get_query_to_df( 9424 f""" SELECT {samples_fields} FROM {table_variants} """ 9425 ) 9426 9427 # Create barcode column 9428 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9429 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9430 ) 9431 9432 # Add barcode to header 9433 vcf_reader.infos[tag] = vcf.parser._Info( 9434 tag, 9435 ".", 9436 "String", 9437 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9438 "howard calculation", 9439 "0", 9440 self.code_type_map.get("String"), 9441 ) 9442 9443 # Update 9444 sql_update = f""" 9445 UPDATE {table_variants} 9446 SET "INFO" = 9447 concat( 9448 CASE 9449 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9450 THEN '' 9451 ELSE concat("INFO", ';') 9452 END, 9453 CASE 
9454 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9455 AND dataframe_barcode."{barcode_infos}" NOT NULL 9456 THEN concat( 9457 '{tag}=', 9458 dataframe_barcode."{barcode_infos}" 9459 ) 9460 ELSE '' 9461 END 9462 ) 9463 FROM dataframe_barcode 9464 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9465 """ 9466 self.conn.execute(sql_update) 9467 9468 # Remove added columns 9469 for added_column in added_columns: 9470 self.drop_column(column=added_column) 9471 9472 # Delete dataframe 9473 del dataframe_barcode 9474 gc.collect() 9475 9476 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9477 """ 9478 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9479 and updates the INFO field in the file with the calculated barcode values. 9480 9481 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9482 the barcode tag that will be added to the VCF file during the calculation process. 
If no value 9483 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9484 :type tag: str (optional) 9485 """ 9486 9487 # if FORMAT and samples 9488 if ( 9489 "FORMAT" in self.get_header_columns_as_list() 9490 and self.get_header_sample_list() 9491 ): 9492 9493 # barcode annotation field 9494 if not tag: 9495 tag = "BCF" 9496 9497 # VCF infos tags 9498 vcf_infos_tags = { 9499 tag: "barcode family calculation", 9500 f"{tag}S": "barcode family samples", 9501 } 9502 9503 # Param 9504 param = self.get_param() 9505 log.debug(f"param={param}") 9506 9507 # Prefix 9508 prefix = self.get_explode_infos_prefix() 9509 9510 # PED param 9511 ped = ( 9512 param.get("calculation", {}) 9513 .get("calculations", {}) 9514 .get("BARCODEFAMILY", {}) 9515 .get("family_pedigree", None) 9516 ) 9517 log.debug(f"ped={ped}") 9518 9519 # Load PED 9520 if ped: 9521 9522 # Pedigree is a file 9523 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9524 log.debug("Pedigree is file") 9525 with open(full_path(ped)) as ped: 9526 ped = yaml.safe_load(ped) 9527 9528 # Pedigree is a string 9529 elif isinstance(ped, str): 9530 log.debug("Pedigree is str") 9531 try: 9532 ped = json.loads(ped) 9533 log.debug("Pedigree is json str") 9534 except ValueError as e: 9535 ped_samples = ped.split(",") 9536 ped = {} 9537 for ped_sample in ped_samples: 9538 ped[ped_sample] = ped_sample 9539 9540 # Pedigree is a dict 9541 elif isinstance(ped, dict): 9542 log.debug("Pedigree is dict") 9543 9544 # Pedigree is not well formatted 9545 else: 9546 msg_error = "Pedigree not well formatted" 9547 log.error(msg_error) 9548 raise ValueError(msg_error) 9549 9550 # Construct list 9551 ped_samples = list(ped.values()) 9552 9553 else: 9554 log.debug("Pedigree not defined. 
Take all samples") 9555 ped_samples = self.get_header_sample_list() 9556 ped = {} 9557 for ped_sample in ped_samples: 9558 ped[ped_sample] = ped_sample 9559 9560 # Check pedigree 9561 if not ped or len(ped) == 0: 9562 msg_error = f"Error in pedigree: samples {ped_samples}" 9563 log.error(msg_error) 9564 raise ValueError(msg_error) 9565 9566 # Log 9567 log.info( 9568 "Calculation 'BARCODEFAMILY' - Samples: " 9569 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 9570 ) 9571 log.debug(f"ped_samples={ped_samples}") 9572 9573 # Field 9574 barcode_infos = prefix + tag 9575 9576 # Variants table 9577 table_variants = self.get_table_variants() 9578 9579 # Header 9580 vcf_reader = self.get_header() 9581 9582 # Create variant id 9583 variant_id_column = self.get_variant_id_column() 9584 added_columns = [variant_id_column] 9585 9586 # variant_id, FORMAT and samples 9587 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9588 [f""" "{sample}" """ for sample in ped_samples] 9589 ) 9590 9591 # Create dataframe 9592 dataframe_barcode = self.get_query_to_df( 9593 f""" SELECT {samples_fields} FROM {table_variants} """ 9594 ) 9595 9596 # Create barcode column 9597 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9598 lambda row: barcode(row, samples=ped_samples), axis=1 9599 ) 9600 9601 # Add barcode family to header 9602 # Add vaf_normalization to header 9603 vcf_reader.formats[tag] = vcf.parser._Format( 9604 id=tag, 9605 num=".", 9606 type="String", 9607 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9608 type_code=self.code_type_map.get("String"), 9609 ) 9610 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9611 id=f"{tag}S", 9612 num=".", 9613 type="String", 9614 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 9615 type_code=self.code_type_map.get("String"), 9616 ) 9617 9618 # Update 9619 # for sample in ped_samples: 9620 sql_update_set = [] 9621 for sample in self.get_header_sample_list() + ["FORMAT"]: 9622 if 
sample in ped_samples: 9623 value = f'dataframe_barcode."{barcode_infos}"' 9624 value_samples = ( 9625 "'" 9626 + ",".join([f""" "{sample}" """ for sample in ped_samples]) 9627 + "'" 9628 ) 9629 ped_samples 9630 elif sample == "FORMAT": 9631 value = f"'{tag}'" 9632 value_samples = f"'{tag}S'" 9633 else: 9634 value = "'.'" 9635 value_samples = "'.'" 9636 format_regex = r"[a-zA-Z0-9\s]" 9637 sql_update_set.append( 9638 f""" 9639 "{sample}" = 9640 concat( 9641 CASE 9642 WHEN {table_variants}."{sample}" = './.' 9643 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9644 ELSE {table_variants}."{sample}" 9645 END, 9646 ':', 9647 {value}, 9648 ':', 9649 {value_samples} 9650 ) 9651 """ 9652 ) 9653 9654 sql_update_set_join = ", ".join(sql_update_set) 9655 sql_update = f""" 9656 UPDATE {table_variants} 9657 SET {sql_update_set_join} 9658 FROM dataframe_barcode 9659 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9660 """ 9661 self.conn.execute(sql_update) 9662 9663 # Remove added columns 9664 for added_column in added_columns: 9665 self.drop_column(column=added_column) 9666 9667 # Delete dataframe 9668 del dataframe_barcode 9669 gc.collect() 9670 9671 def calculation_trio(self) -> None: 9672 """ 9673 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9674 information to the INFO field of each variant. 
9675 """ 9676 9677 # if FORMAT and samples 9678 if ( 9679 "FORMAT" in self.get_header_columns_as_list() 9680 and self.get_header_sample_list() 9681 ): 9682 9683 # trio annotation field 9684 trio_tag = "trio" 9685 9686 # VCF infos tags 9687 vcf_infos_tags = { 9688 "trio": "trio calculation", 9689 } 9690 9691 # Param 9692 param = self.get_param() 9693 9694 # Prefix 9695 prefix = self.get_explode_infos_prefix() 9696 9697 # Trio param 9698 trio_ped = ( 9699 param.get("calculation", {}) 9700 .get("calculations", {}) 9701 .get("TRIO", {}) 9702 .get("trio_pedigree", None) 9703 ) 9704 9705 # Load trio 9706 if trio_ped: 9707 9708 # Trio pedigree is a file 9709 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9710 log.debug("TRIO pedigree is file") 9711 with open(full_path(trio_ped)) as trio_ped: 9712 trio_ped = yaml.safe_load(trio_ped) 9713 9714 # Trio pedigree is a string 9715 elif isinstance(trio_ped, str): 9716 log.debug("TRIO pedigree is str") 9717 try: 9718 trio_ped = json.loads(trio_ped) 9719 log.debug("TRIO pedigree is json str") 9720 except ValueError as e: 9721 trio_samples = trio_ped.split(",") 9722 if len(trio_samples) == 3: 9723 trio_ped = { 9724 "father": trio_samples[0], 9725 "mother": trio_samples[1], 9726 "child": trio_samples[2], 9727 } 9728 log.debug("TRIO pedigree is list str") 9729 else: 9730 msg_error = "TRIO pedigree not well formatted" 9731 log.error(msg_error) 9732 raise ValueError(msg_error) 9733 9734 # Trio pedigree is a dict 9735 elif isinstance(trio_ped, dict): 9736 log.debug("TRIO pedigree is dict") 9737 9738 # Trio pedigree is not well formatted 9739 else: 9740 msg_error = "TRIO pedigree not well formatted" 9741 log.error(msg_error) 9742 raise ValueError(msg_error) 9743 9744 # Construct trio list 9745 trio_samples = [ 9746 trio_ped.get("father", ""), 9747 trio_ped.get("mother", ""), 9748 trio_ped.get("child", ""), 9749 ] 9750 9751 else: 9752 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9753 samples_list = self.get_header_sample_list() 9754 if len(samples_list) >= 3: 9755 trio_samples = self.get_header_sample_list()[0:3] 9756 trio_ped = { 9757 "father": trio_samples[0], 9758 "mother": trio_samples[1], 9759 "child": trio_samples[2], 9760 } 9761 else: 9762 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9763 log.error(msg_error) 9764 raise ValueError(msg_error) 9765 9766 # Check trio pedigree 9767 if not trio_ped or len(trio_ped) != 3: 9768 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9769 log.error(msg_error) 9770 raise ValueError(msg_error) 9771 9772 # Log 9773 log.info( 9774 f"Calculation 'TRIO' - Samples: " 9775 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9776 ) 9777 9778 # Field 9779 trio_infos = prefix + trio_tag 9780 9781 # Variants table 9782 table_variants = self.get_table_variants() 9783 9784 # Header 9785 vcf_reader = self.get_header() 9786 9787 # Create variant id 9788 variant_id_column = self.get_variant_id_column() 9789 added_columns = [variant_id_column] 9790 9791 # variant_id, FORMAT and samples 9792 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9793 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9794 ) 9795 9796 # Create dataframe 9797 dataframe_trio = self.get_query_to_df( 9798 f""" SELECT {samples_fields} FROM {table_variants} """ 9799 ) 9800 9801 # Create trio column 9802 dataframe_trio[trio_infos] = dataframe_trio.apply( 9803 lambda row: trio(row, samples=trio_samples), axis=1 9804 ) 9805 9806 # Add trio to header 9807 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9808 trio_tag, 9809 ".", 9810 "String", 9811 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9812 "howard calculation", 9813 "0", 9814 self.code_type_map.get("String"), 9815 ) 9816 9817 # Update 9818 sql_update = f""" 9819 UPDATE {table_variants} 9820 SET "INFO" = 9821 concat( 9822 CASE 9823 WHEN "INFO" IS NULL OR "INFO" IN 
('','.') 9824 THEN '' 9825 ELSE concat("INFO", ';') 9826 END, 9827 CASE 9828 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9829 AND dataframe_trio."{trio_infos}" NOT NULL 9830 THEN concat( 9831 '{trio_tag}=', 9832 dataframe_trio."{trio_infos}" 9833 ) 9834 ELSE '' 9835 END 9836 ) 9837 FROM dataframe_trio 9838 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9839 """ 9840 self.conn.execute(sql_update) 9841 9842 # Remove added columns 9843 for added_column in added_columns: 9844 self.drop_column(column=added_column) 9845 9846 # Delete dataframe 9847 del dataframe_trio 9848 gc.collect() 9849 9850 def calculation_vaf_normalization(self) -> None: 9851 """ 9852 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9853 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9854 :return: The function does not return anything. 9855 """ 9856 9857 # if FORMAT and samples 9858 if ( 9859 "FORMAT" in self.get_header_columns_as_list() 9860 and self.get_header_sample_list() 9861 ): 9862 9863 # vaf_normalization annotation field 9864 vaf_normalization_tag = "VAF" 9865 9866 # VCF infos tags 9867 vcf_infos_tags = { 9868 "VAF": "VAF Variant Frequency", 9869 } 9870 9871 # Prefix 9872 prefix = self.get_explode_infos_prefix() 9873 9874 # Variants table 9875 table_variants = self.get_table_variants() 9876 9877 # Header 9878 vcf_reader = self.get_header() 9879 9880 # Do not calculate if VAF already exists 9881 if "VAF" in vcf_reader.formats: 9882 log.debug("VAF already on genotypes") 9883 return 9884 9885 # Create variant id 9886 variant_id_column = self.get_variant_id_column() 9887 added_columns = [variant_id_column] 9888 9889 # variant_id, FORMAT and samples 9890 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9891 f""" "{sample}" """ for sample in self.get_header_sample_list() 9892 ) 9893 9894 # Create dataframe 9895 query = f""" SELECT 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        The calculation only runs when the VCF has genotypes (FORMAT column and at least one
        sample). Per-variant statistics are computed in pandas via the `genotype_stats` helper
        (which returns a dict keyed by '<info>_stats_*'), then each statistic is appended to
        the INFO column as its own tag.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable if genotypes are present (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (keys are the exact stat names produced by genotype_stats)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (join key for the UPDATE), dropped at the end
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe with only the columns needed for the calculation
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (a dict of statistics per variant row)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic from the per-row dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic tag to the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator baked into the SQL fragment: only the first appended
                # field omits the leading ';' (the outer CASE already adds one
                # after the existing INFO content)
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update (fragments become successive concat() arguments)
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update INFO by joining the in-memory dataframe
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe and reclaim memory
            del dataframe_vaf_stats
            gc.collect()
The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 10091 field to it if transcripts are available. 10092 10093 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10094 is a string parameter that represents the information field to be used in the transcripts JSON. 10095 It is used to specify the JSON format for the transcripts information. If no value is provided 10096 when calling the method, it defaults to " 10097 :type info_json: str 10098 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10099 method is a string parameter that specifies the format of the information field to be used in 10100 the transcripts JSON. It is used to define the format of the information field 10101 :type info_format: str 10102 """ 10103 10104 # Create transcripts table 10105 transcripts_table = self.create_transcript_view() 10106 10107 # Add info field 10108 if transcripts_table: 10109 self.transcript_view_to_variants( 10110 transcripts_table=transcripts_table, 10111 transcripts_info_field_json=info_json, 10112 transcripts_info_field_format=info_format, 10113 ) 10114 else: 10115 log.info("No Transcripts to process. Check param.json file configuration") 10116 10117 def calculation_transcripts_prioritization(self) -> None: 10118 """ 10119 The function `calculation_transcripts_prioritization` creates a transcripts table and 10120 prioritizes transcripts based on certain criteria. 10121 """ 10122 10123 # Create transcripts table 10124 transcripts_table = self.create_transcript_view() 10125 10126 # Add info field 10127 if transcripts_table: 10128 self.transcripts_prioritization(transcripts_table=transcripts_table) 10129 else: 10130 log.info("No Transcripts to process. 
Check param.json file configuration") 10131 10132 def calculation_transcripts_export(self) -> None: 10133 """ """ 10134 10135 # Create transcripts table 10136 transcripts_table = self.create_transcript_view() 10137 10138 # Add info field 10139 if transcripts_table: 10140 self.transcripts_export(transcripts_table=transcripts_table) 10141 else: 10142 log.info("No Transcripts to process. Check param.json file configuration") 10143 10144 ############### 10145 # Transcripts # 10146 ############### 10147 10148 def transcripts_export( 10149 self, transcripts_table: str = None, param: dict = {} 10150 ) -> bool: 10151 """ """ 10152 10153 log.debug("Start transcripts export...") 10154 10155 # Param 10156 if not param: 10157 param = self.get_param() 10158 10159 # Param export 10160 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10161 10162 # Output file 10163 transcripts_export_output = param_transcript_export.get("output", None) 10164 10165 if not param_transcript_export or not transcripts_export_output: 10166 log.warning(f"No transcriipts export parameters defined!") 10167 return False 10168 10169 # List of transcripts annotations 10170 query_describe = f""" 10171 SELECT column_name 10172 FROM ( 10173 DESCRIBE SELECT * FROM {transcripts_table} 10174 ) 10175 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10176 """ 10177 transcripts_annotations_list = list( 10178 self.get_query_to_df(query=query_describe)["column_name"] 10179 ) 10180 10181 # Create transcripts table for export 10182 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10183 random.choices(string.ascii_uppercase + string.digits, k=10) 10184 ) 10185 query_create_transcripts_table_export = f""" 10186 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10187 """ 10188 self.execute_query(query=query_create_transcripts_table_export) 10189 10190 # 
Output file format 10191 transcripts_export_output_format = get_file_format( 10192 filename=transcripts_export_output 10193 ) 10194 10195 # Format VCF - construct INFO 10196 if transcripts_export_output_format in ["vcf"]: 10197 10198 # Construct query update INFO and header 10199 query_update_info = [] 10200 for field in transcripts_annotations_list: 10201 10202 # If field not in header 10203 if field not in self.get_header_infos_list(): 10204 10205 # Add PZ Transcript in header 10206 self.get_header().infos[field] = vcf.parser._Info( 10207 field, 10208 ".", 10209 "String", 10210 f"Annotation '{field}' from transcript view", 10211 "unknown", 10212 "unknown", 10213 0, 10214 ) 10215 10216 # Add field as INFO/tag 10217 query_update_info.append( 10218 f""" 10219 CASE 10220 WHEN "{field}" IS NOT NULL 10221 THEN concat('{field}=', "{field}", ';') 10222 ELSE '' 10223 END 10224 """ 10225 ) 10226 10227 # Query param 10228 query_update_info_value = ( 10229 f""" concat('', {", ".join(query_update_info)}) """ 10230 ) 10231 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' 
AS 'FILTER', "INFO" """ 10232 10233 else: 10234 10235 # Query param 10236 query_update_info_value = f""" NULL """ 10237 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10238 10239 # Update query INFO column 10240 query_update = f""" 10241 UPDATE {transcripts_table_export} 10242 SET INFO = {query_update_info_value} 10243 10244 """ 10245 self.execute_query(query=query_update) 10246 10247 # Export 10248 self.export_output( 10249 output_file=transcripts_export_output, 10250 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10251 ) 10252 10253 # Drop transcripts export table 10254 query_drop_transcripts_table_export = f""" 10255 DROP TABLE {transcripts_table_export} 10256 """ 10257 self.execute_query(query=query_drop_transcripts_table_export) 10258 10259 def transcripts_prioritization( 10260 self, transcripts_table: str = None, param: dict = {} 10261 ) -> bool: 10262 """ 10263 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 10264 and updates the variants table with the prioritized information. 10265 10266 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10267 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 10268 This parameter is used to identify the table where the transcripts data is stored for the 10269 prioritization process 10270 :type transcripts_table: str 10271 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 10272 that contains various configuration settings for the prioritization process of transcripts. 
It 10273 is used to customize the behavior of the prioritization algorithm and includes settings such as 10274 the prefix for prioritization fields, default profiles, and other 10275 :type param: dict 10276 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 10277 transcripts prioritization process is successfully completed, and `False` if there are any 10278 issues or if no profile is defined for transcripts prioritization. 10279 """ 10280 10281 log.debug("Start transcripts prioritization...") 10282 10283 # Param 10284 if not param: 10285 param = self.get_param() 10286 10287 # Variants table 10288 table_variants = self.get_table_variants() 10289 10290 # Transcripts table 10291 if transcripts_table is None: 10292 transcripts_table = self.create_transcript_view( 10293 transcripts_table="transcripts", param=param 10294 ) 10295 if transcripts_table is None: 10296 msg_err = "No Transcripts table availalble" 10297 log.error(msg_err) 10298 raise ValueError(msg_err) 10299 log.debug(f"transcripts_table={transcripts_table}") 10300 10301 # Get transcripts columns 10302 columns_as_list_query = f""" 10303 DESCRIBE {transcripts_table} 10304 """ 10305 columns_as_list = list( 10306 self.get_query_to_df(columns_as_list_query)["column_name"] 10307 ) 10308 10309 # Create INFO if not exists 10310 if "INFO" not in columns_as_list: 10311 query_add_info = f""" 10312 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 10313 """ 10314 self.execute_query(query_add_info) 10315 10316 # Prioritization param and Force only PZ Score and Flag 10317 pz_param = param.get("transcripts", {}).get("prioritization", {}) 10318 10319 # PZ profile by default 10320 pz_profile_default = ( 10321 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 10322 ) 10323 10324 # Exit if no profile 10325 if pz_profile_default is None: 10326 log.warning("No profile defined for transcripts prioritization") 10327 return False 10328 10329 # PZ 
fields 10330 pz_param_pzfields = {} 10331 10332 # PZ field transcripts 10333 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 10334 10335 # Add PZ Transcript in header 10336 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 10337 pz_fields_transcripts, 10338 ".", 10339 "String", 10340 f"Transcript selected from prioritization process, profile {pz_profile_default}", 10341 "unknown", 10342 "unknown", 10343 code_type_map["String"], 10344 ) 10345 10346 # Mandatory fields if asked in param 10347 pz_mandatory_fields_list = [ 10348 "Score", 10349 "Flag", 10350 "Tags", 10351 "Comment", 10352 "Infos", 10353 "Class", 10354 ] 10355 pz_mandatory_fields = [] 10356 for pz_mandatory_field in pz_mandatory_fields_list: 10357 pz_mandatory_fields.append( 10358 pz_param.get("pzprefix", "PTZ") + pz_mandatory_field 10359 ) 10360 10361 # PZ fields in param 10362 pz_param_mandatory_fields = [] 10363 for pz_field in pz_param.get("pzfields", []): 10364 if pz_field in pz_mandatory_fields_list: 10365 pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = ( 10366 pz_param.get("pzprefix", "PTZ") + pz_field 10367 ) 10368 pz_param_mandatory_fields.append( 10369 pz_param.get("pzprefix", "PTZ") + pz_field 10370 ) 10371 else: 10372 pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field 10373 pz_param_pzfields[pz_field] = pz_field_new 10374 10375 # Add PZ Transcript in header 10376 self.get_header().infos[pz_field_new] = vcf.parser._Info( 10377 pz_field_new, 10378 ".", 10379 "String", 10380 f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}", 10381 "unknown", 10382 "unknown", 10383 code_type_map["String"], 10384 ) 10385 10386 # PZ fields param 10387 pz_mandatory_fields = pz_param_mandatory_fields 10388 pz_param["pzfields"] = pz_mandatory_fields 10389 10390 # Prioritization 10391 prioritization_result = self.prioritization( 10392 table=transcripts_table, 10393 pz_param=param.get("transcripts", 
{}).get("prioritization", {}), 10394 ) 10395 if not prioritization_result: 10396 log.warning("Transcripts prioritization not processed") 10397 return False 10398 10399 # PZ fields sql query 10400 query_update_select_list = [] 10401 query_update_concat_list = [] 10402 query_update_order_list = [] 10403 for pz_param_pzfield in set( 10404 list(pz_param_pzfields.keys()) + pz_mandatory_fields 10405 ): 10406 query_update_select_list.append(f" {pz_param_pzfield}, ") 10407 10408 for pz_param_pzfield in pz_param_pzfields: 10409 query_update_concat_list.append( 10410 f""" 10411 , CASE 10412 WHEN {pz_param_pzfield} IS NOT NULL 10413 THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield}) 10414 ELSE '' 10415 END 10416 """ 10417 ) 10418 10419 # Order by 10420 pz_orders = ( 10421 param.get("transcripts", {}) 10422 .get("prioritization", {}) 10423 .get("prioritization_transcripts_order", {}) 10424 ) 10425 if not pz_orders: 10426 pz_orders = { 10427 pz_param.get("pzprefix", "PTZ") + "Flag": "DESC", 10428 pz_param.get("pzprefix", "PTZ") + "Score": "DESC", 10429 } 10430 for pz_order in pz_orders: 10431 query_update_order_list.append( 10432 f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """ 10433 ) 10434 10435 # Fields to explode 10436 fields_to_explode = ( 10437 list(pz_param_pzfields.keys()) 10438 + pz_mandatory_fields 10439 + list(pz_orders.keys()) 10440 ) 10441 # Remove transcript column as a specific transcript column 10442 if "transcript" in fields_to_explode: 10443 fields_to_explode.remove("transcript") 10444 10445 # Fields intranscripts table 10446 query_transcripts_table = f""" 10447 DESCRIBE SELECT * FROM {transcripts_table} 10448 """ 10449 query_transcripts_table = self.get_query_to_df(query=query_transcripts_table) 10450 10451 # Check fields to explode 10452 for field_to_explode in fields_to_explode: 10453 if field_to_explode not in self.get_header_infos_list() + list( 10454 query_transcripts_table.column_name 10455 ): 10456 msg_err = 
f"INFO/{field_to_explode} NOT IN header" 10457 log.error(msg_err) 10458 raise ValueError(msg_err) 10459 10460 # Explode fields to explode 10461 self.explode_infos( 10462 table=transcripts_table, 10463 fields=fields_to_explode, 10464 ) 10465 10466 # Transcript preference file 10467 transcripts_preference_file = ( 10468 param.get("transcripts", {}) 10469 .get("prioritization", {}) 10470 .get("prioritization_transcripts", {}) 10471 ) 10472 transcripts_preference_file = full_path(transcripts_preference_file) 10473 10474 # Transcript preference forced 10475 transcript_preference_force = ( 10476 param.get("transcripts", {}) 10477 .get("prioritization", {}) 10478 .get("prioritization_transcripts_force", False) 10479 ) 10480 # Transcript version forced 10481 transcript_version_force = ( 10482 param.get("transcripts", {}) 10483 .get("prioritization", {}) 10484 .get("prioritization_transcripts_version_force", False) 10485 ) 10486 10487 # Transcripts Ranking 10488 if transcripts_preference_file: 10489 10490 # Transcripts file to dataframe 10491 if os.path.exists(transcripts_preference_file): 10492 transcripts_preference_dataframe = transcripts_file_to_df( 10493 transcripts_preference_file 10494 ) 10495 else: 10496 log.error( 10497 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10498 ) 10499 raise ValueError( 10500 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10501 ) 10502 10503 # Order by depending to transcript preference forcing 10504 if transcript_preference_force: 10505 order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """ 10506 else: 10507 order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """ 10508 10509 # Transcript columns joined depend on version consideration 10510 if transcript_version_force: 10511 transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference 
""" 10512 else: 10513 transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """ 10514 10515 # Query ranking for update 10516 query_update_ranking = f""" 10517 SELECT 10518 "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)} 10519 ROW_NUMBER() OVER ( 10520 PARTITION BY "#CHROM", POS, REF, ALT 10521 ORDER BY {order_by} 10522 ) AS rn 10523 FROM {transcripts_table} 10524 LEFT JOIN 10525 ( 10526 SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order 10527 FROM transcripts_preference_dataframe 10528 ) AS transcripts_preference 10529 ON {transcripts_version_join} 10530 """ 10531 10532 else: 10533 10534 # Query ranking for update 10535 query_update_ranking = f""" 10536 SELECT 10537 "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)} 10538 ROW_NUMBER() OVER ( 10539 PARTITION BY "#CHROM", POS, REF, ALT 10540 ORDER BY {" , ".join(query_update_order_list)} 10541 ) AS rn 10542 FROM {transcripts_table} 10543 """ 10544 10545 # Export Transcripts prioritization infos to variants table 10546 query_update = f""" 10547 WITH RankedTranscripts AS ( 10548 {query_update_ranking} 10549 ) 10550 UPDATE {table_variants} 10551 SET 10552 INFO = CONCAT(CASE 10553 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10554 THEN '' 10555 ELSE concat("INFO", ';') 10556 END, 10557 concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)}) 10558 ) 10559 FROM 10560 RankedTranscripts 10561 WHERE 10562 rn = 1 10563 AND variants."#CHROM" = RankedTranscripts."#CHROM" 10564 AND variants."POS" = RankedTranscripts."POS" 10565 AND variants."REF" = RankedTranscripts."REF" 10566 AND variants."ALT" = RankedTranscripts."ALT" 10567 """ 10568 10569 # log.debug(f"query_update={query_update}") 10570 self.execute_query(query=query_update) 10571 10572 # Return 10573 return True 10574 10575 def 
create_transcript_view_from_columns_map( 10576 self, 10577 transcripts_table: str = "transcripts", 10578 columns_maps: dict = {}, 10579 added_columns: list = [], 10580 temporary_tables: list = None, 10581 annotation_fields: list = None, 10582 column_rename: dict = {}, 10583 column_clean: bool = False, 10584 column_case: str = None, 10585 ) -> tuple[list, list, list]: 10586 """ 10587 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10588 specified columns mapping for transcripts data. 10589 10590 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10591 of the table where the transcripts data is stored or will be stored in the database. This table 10592 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10593 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10594 :type transcripts_table: str (optional) 10595 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10596 about how to map columns from a transcripts table to create a view. Each entry in the 10597 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10598 typically includes details such as the main transcript column and additional information columns 10599 :type columns_maps: dict 10600 :param added_columns: The `added_columns` parameter in the 10601 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10602 that will be added to the view being created based on the columns map provided. 
These columns 10603 are generated by exploding the transcript information columns along with the main transcript 10604 column 10605 :type added_columns: list 10606 :param temporary_tables: The `temporary_tables` parameter in the 10607 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10608 tables created during the process of creating a transcript view from a columns map. These 10609 temporary tables are used to store intermediate results or transformations before the final view 10610 is generated 10611 :type temporary_tables: list 10612 :param annotation_fields: The `annotation_fields` parameter in the 10613 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10614 used for annotation in the query view creation process. These fields are extracted from the 10615 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10616 :type annotation_fields: list 10617 :param column_rename: The `column_rename` parameter in the 10618 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10619 custom renaming for columns during the creation of the temporary table view. This parameter 10620 provides a mapping of original column names to the desired renamed column names. By using this 10621 parameter, 10622 :type column_rename: dict 10623 :param column_clean: The `column_clean` parameter in the 10624 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10625 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10626 removing any non-alphanumeric characters from them. 
This cleaning process ensures, defaults to 10627 False 10628 :type column_clean: bool (optional) 10629 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10630 function is used to specify the case transformation to be applied to the columns during the view 10631 creation process. It allows you to control whether the column values should be converted to 10632 lowercase, uppercase, or remain unchanged 10633 :type column_case: str 10634 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10635 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 10636 """ 10637 10638 log.debug("Start transcrpts view creation from columns map...") 10639 10640 # "from_columns_map": [ 10641 # { 10642 # "transcripts_column": "Ensembl_transcriptid", 10643 # "transcripts_infos_columns": [ 10644 # "genename", 10645 # "Ensembl_geneid", 10646 # "LIST_S2_score", 10647 # "LIST_S2_pred", 10648 # ], 10649 # }, 10650 # { 10651 # "transcripts_column": "Ensembl_transcriptid", 10652 # "transcripts_infos_columns": [ 10653 # "genename", 10654 # "VARITY_R_score", 10655 # "Aloft_pred", 10656 # ], 10657 # }, 10658 # ], 10659 10660 # Init 10661 if temporary_tables is None: 10662 temporary_tables = [] 10663 if annotation_fields is None: 10664 annotation_fields = [] 10665 10666 # Variants table 10667 table_variants = self.get_table_variants() 10668 10669 for columns_map in columns_maps: 10670 10671 # Log 10672 log.debug(f"columns_map={columns_map}") 10673 10674 # Transcript column 10675 transcripts_column = columns_map.get("transcripts_column", None) 10676 10677 # Transcripts infos columns 10678 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10679 10680 # Transcripts infos columns rename 10681 column_rename = columns_map.get("column_rename", column_rename) 10682 10683 # Transcripts infos columns clean 10684 column_clean = columns_map.get("column_clean", column_clean) 10685 
10686 # Transcripts infos columns case 10687 column_case = columns_map.get("column_case", column_case) 10688 10689 if transcripts_column is not None: 10690 10691 # Explode 10692 added_columns += self.explode_infos( 10693 fields=[transcripts_column] + transcripts_infos_columns 10694 ) 10695 10696 # View clauses 10697 clause_select_variants = [] 10698 clause_select_tanscripts = [] 10699 for field in [transcripts_column] + transcripts_infos_columns: 10700 10701 # AS field 10702 as_field = field 10703 10704 # Rename 10705 if column_rename: 10706 as_field = column_rename.get(as_field, as_field) 10707 10708 # Clean 10709 if column_clean: 10710 as_field = clean_annotation_field(as_field) 10711 10712 # Case 10713 if column_case: 10714 if column_case.lower() in ["lower"]: 10715 as_field = as_field.lower() 10716 elif column_case.lower() in ["upper"]: 10717 as_field = as_field.upper() 10718 10719 # Clause select Variants 10720 clause_select_variants.append( 10721 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10722 ) 10723 10724 if field in [transcripts_column]: 10725 clause_select_tanscripts.append( 10726 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10727 ) 10728 else: 10729 clause_select_tanscripts.append( 10730 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10731 ) 10732 annotation_fields.append(as_field) 10733 10734 # Query View 10735 query = f""" 10736 SELECT 10737 "#CHROM", POS, REF, ALT, INFO, 10738 "{transcripts_column}" AS 'transcript', 10739 {", ".join(clause_select_tanscripts)} 10740 FROM ( 10741 SELECT 10742 "#CHROM", POS, REF, ALT, INFO, 10743 {", ".join(clause_select_variants)} 10744 FROM {table_variants} 10745 ) 10746 WHERE "{transcripts_column}" IS NOT NULL 10747 """ 10748 10749 # Create temporary table 10750 temporary_table = transcripts_table + "".join( 10751 random.choices(string.ascii_uppercase + string.digits, k=10) 10752 ) 10753 10754 # Temporary view 10755 temporary_tables.append(temporary_table) 10756 
query_view = f""" 10757 CREATE view {temporary_table} 10758 AS ({query}) 10759 """ 10760 self.execute_query(query=query_view) 10761 10762 return added_columns, temporary_tables, annotation_fields 10763 10764 def create_transcript_view_from_column_format( 10765 self, 10766 transcripts_table: str = "transcripts", 10767 column_formats: dict = {}, 10768 temporary_tables: list = None, 10769 annotation_fields: list = None, 10770 column_rename: dict = {}, 10771 column_clean: bool = False, 10772 column_case: str = None, 10773 ) -> tuple[list, list, list]: 10774 """ 10775 The `create_transcript_view_from_column_format` function generates a transcript view based on 10776 specified column formats, adds additional columns and annotation fields, and returns the list of 10777 temporary tables and annotation fields. 10778 10779 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10780 of the table containing the transcripts data. This table will be used as the base table for 10781 creating the transcript view. The default value for this parameter is "transcripts", but you can 10782 provide a different table name if needed, defaults to transcripts 10783 :type transcripts_table: str (optional) 10784 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10785 about the columns to be used for creating the transcript view. Each entry in the dictionary 10786 specifies the mapping between a transcripts column and a transcripts infos column. This 10787 parameter allows you to define how the columns from the transcripts table should be transformed 10788 or mapped 10789 :type column_formats: dict 10790 :param temporary_tables: The `temporary_tables` parameter in the 10791 `create_transcript_view_from_column_format` function is a list that stores the names of 10792 temporary views created during the process of creating a transcript view from a column format. 
10793 These temporary views are used to manipulate and extract data before generating the final 10794 transcript view 10795 :type temporary_tables: list 10796 :param annotation_fields: The `annotation_fields` parameter in the 10797 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10798 that are extracted from the temporary views created during the process. These annotation fields 10799 are obtained by querying the temporary views and extracting the column names excluding specific 10800 columns like `#CH 10801 :type annotation_fields: list 10802 :param column_rename: The `column_rename` parameter in the 10803 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10804 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10805 column names to new column names in this dictionary, you can rename specific columns during the 10806 process 10807 :type column_rename: dict 10808 :param column_clean: The `column_clean` parameter in the 10809 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10810 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10811 will be cleaned during the creation of the transcript view based on the specified column format, 10812 defaults to False 10813 :type column_clean: bool (optional) 10814 :param column_case: The `column_case` parameter in the 10815 `create_transcript_view_from_column_format` function is used to specify the case transformation 10816 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10817 to convert the column names to uppercase or lowercase, respectively 10818 :type column_case: str 10819 :return: The `create_transcript_view_from_column_format` function returns two lists: 10820 `temporary_tables` and `annotation_fields`. 
10821 """ 10822 10823 log.debug("Start transcrpts view creation from column format...") 10824 10825 # "from_column_format": [ 10826 # { 10827 # "transcripts_column": "ANN", 10828 # "transcripts_infos_column": "Feature_ID", 10829 # } 10830 # ], 10831 10832 # Init 10833 if temporary_tables is None: 10834 temporary_tables = [] 10835 if annotation_fields is None: 10836 annotation_fields = [] 10837 10838 added_columns = [] 10839 10840 for column_format in column_formats: 10841 10842 # annotation field and transcript annotation field 10843 annotation_field = column_format.get("transcripts_column", "ANN") 10844 transcript_annotation = column_format.get( 10845 "transcripts_infos_column", "Feature_ID" 10846 ) 10847 10848 # Transcripts infos columns rename 10849 column_rename = column_format.get("column_rename", column_rename) 10850 10851 # Transcripts infos columns clean 10852 column_clean = column_format.get("column_clean", column_clean) 10853 10854 # Transcripts infos columns case 10855 column_case = column_format.get("column_case", column_case) 10856 10857 # Temporary View name 10858 temporary_view_name = transcripts_table + "".join( 10859 random.choices(string.ascii_uppercase + string.digits, k=10) 10860 ) 10861 10862 # Create temporary view name 10863 temporary_view_name, added_columns = self.annotation_format_to_table( 10864 annotation_field=annotation_field, 10865 view_name=temporary_view_name, 10866 annotation_id=transcript_annotation, 10867 column_rename=column_rename, 10868 column_clean=column_clean, 10869 column_case=column_case, 10870 ) 10871 10872 # Annotation fields 10873 if temporary_view_name: 10874 query_annotation_fields = f""" 10875 SELECT * 10876 FROM ( 10877 DESCRIBE SELECT * 10878 FROM {temporary_view_name} 10879 ) 10880 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10881 """ 10882 df_annotation_fields = self.get_query_to_df( 10883 query=query_annotation_fields 10884 ) 10885 10886 # Add temporary view and annotation fields 10887 
temporary_tables.append(temporary_view_name) 10888 annotation_fields += list(set(df_annotation_fields["column_name"])) 10889 10890 return added_columns, temporary_tables, annotation_fields 10891 10892 def create_transcript_view( 10893 self, 10894 transcripts_table: str = None, 10895 transcripts_table_drop: bool = False, 10896 param: dict = {}, 10897 ) -> str: 10898 """ 10899 The `create_transcript_view` function generates a transcript view by processing data from a 10900 specified table based on provided parameters and structural information. 10901 10902 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 10903 is used to specify the name of the table that will store the final transcript view data. If a table 10904 name is not provided, the function will create a new table to store the transcript view data, and by 10905 default,, defaults to transcripts 10906 :type transcripts_table: str (optional) 10907 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 10908 `create_transcript_view` function is a boolean parameter that determines whether to drop the 10909 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 10910 the function will drop the existing transcripts table if it exists, defaults to False 10911 :type transcripts_table_drop: bool (optional) 10912 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 10913 contains information needed to create a transcript view. It includes details such as the structure 10914 of the transcripts, columns mapping, column formats, and other necessary information for generating 10915 the view. This parameter allows for flexibility and customization 10916 :type param: dict 10917 :return: The `create_transcript_view` function returns the name of the transcripts table that was 10918 created or modified during the execution of the function. 
10919 """ 10920 10921 log.debug("Start transcripts view creation...") 10922 10923 # Default 10924 transcripts_table_default = "transcripts" 10925 10926 # Param 10927 if not param: 10928 param = self.get_param() 10929 10930 # Struct 10931 struct = param.get("transcripts", {}).get("struct", None) 10932 10933 # Transcript veresion 10934 transcript_id_remove_version = param.get("transcripts", {}).get( 10935 "transcript_id_remove_version", False 10936 ) 10937 10938 # Transcripts mapping 10939 transcript_id_mapping_file = param.get("transcripts", {}).get( 10940 "transcript_id_mapping_file", None 10941 ) 10942 10943 # Transcripts mapping 10944 transcript_id_mapping_force = param.get("transcripts", {}).get( 10945 "transcript_id_mapping_force", None 10946 ) 10947 10948 # Transcripts table 10949 if transcripts_table is None: 10950 transcripts_table = param.get("transcripts", {}).get( 10951 "table", transcripts_table_default 10952 ) 10953 10954 # Check transcripts table exists 10955 if transcripts_table: 10956 10957 # Query to check if transcripts table exists 10958 query_check_table = f""" 10959 SELECT * 10960 FROM information_schema.tables 10961 WHERE table_name = '{transcripts_table}' 10962 """ 10963 df_check_table = self.get_query_to_df(query=query_check_table) 10964 10965 # Check if transcripts table exists 10966 if len(df_check_table) > 0 and not transcripts_table_drop: 10967 log.debug(f"Table {transcripts_table} exists and not drop option") 10968 return transcripts_table 10969 10970 if struct: 10971 10972 # added_columns 10973 added_columns = [] 10974 10975 # Temporary tables 10976 temporary_tables = [] 10977 10978 # Annotation fields 10979 annotation_fields = [] 10980 10981 # from columns map 10982 columns_maps = struct.get("from_columns_map", []) 10983 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10984 self.create_transcript_view_from_columns_map( 10985 transcripts_table=transcripts_table, 10986 columns_maps=columns_maps, 10987 
added_columns=added_columns, 10988 temporary_tables=temporary_tables, 10989 annotation_fields=annotation_fields, 10990 ) 10991 ) 10992 added_columns += added_columns_tmp 10993 temporary_tables += temporary_tables_tmp 10994 annotation_fields += annotation_fields_tmp 10995 10996 # from column format 10997 column_formats = struct.get("from_column_format", []) 10998 added_columns, temporary_tables_tmp, annotation_fields_tmp = ( 10999 self.create_transcript_view_from_column_format( 11000 transcripts_table=transcripts_table, 11001 column_formats=column_formats, 11002 temporary_tables=temporary_tables, 11003 annotation_fields=annotation_fields, 11004 ) 11005 ) 11006 added_columns += added_columns_tmp 11007 temporary_tables += temporary_tables_tmp 11008 annotation_fields += annotation_fields_tmp 11009 11010 # Remove some specific fields/column 11011 annotation_fields = list(set(annotation_fields)) 11012 for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]: 11013 if field in annotation_fields: 11014 annotation_fields.remove(field) 11015 11016 # Merge temporary tables query 11017 query_merge = "" 11018 for temporary_table in list(set(temporary_tables)): 11019 11020 # First temporary table 11021 if not query_merge: 11022 query_merge = f""" 11023 SELECT * FROM {temporary_table} 11024 """ 11025 # other temporary table (using UNION) 11026 else: 11027 query_merge += f""" 11028 UNION BY NAME SELECT * FROM {temporary_table} 11029 """ 11030 11031 # transcript table tmp 11032 transcript_table_tmp = "transcripts_tmp" 11033 transcript_table_tmp2 = "transcripts_tmp2" 11034 transcript_table_tmp3 = "transcripts_tmp3" 11035 11036 # Merge on transcript 11037 query_merge_on_transcripts_annotation_fields = [] 11038 11039 # Add transcript list 11040 query_merge_on_transcripts_annotation_fields.append( 11041 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """ 11042 ) 11043 11044 # Aggregate all annotations 
fields 11045 for annotation_field in set(annotation_fields): 11046 query_merge_on_transcripts_annotation_fields.append( 11047 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """ 11048 ) 11049 11050 # Transcripts mapping 11051 if transcript_id_mapping_file: 11052 11053 # Transcript dataframe 11054 transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe" 11055 transcript_id_mapping_dataframe = transcripts_file_to_df( 11056 transcript_id_mapping_file, column_names=["transcript", "alias"] 11057 ) 11058 11059 # Transcript version remove 11060 if transcript_id_remove_version: 11061 query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped" 11062 query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)" 11063 query_left_join = f""" 11064 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11065 """ 11066 else: 11067 query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped" 11068 query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript" 11069 query_left_join = f""" 11070 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11071 """ 11072 11073 # Transcript column for group by merge 11074 query_transcript_merge_group_by = """ 11075 CASE 11076 WHEN transcript_mapped NOT IN ('') 11077 THEN split_part(transcript_mapped, '.', 1) 
11078 ELSE split_part(transcript_original, '.', 1) 11079 END 11080 """ 11081 11082 # Merge query 11083 transcripts_tmp2_query = f""" 11084 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)} 11085 FROM ({query_merge}) AS {transcript_table_tmp} 11086 {query_left_join} 11087 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by} 11088 """ 11089 11090 # Retrive columns after mege 11091 transcripts_tmp2_describe_query = f""" 11092 DESCRIBE {transcripts_tmp2_query} 11093 """ 11094 transcripts_tmp2_describe_list = list( 11095 self.get_query_to_df(query=transcripts_tmp2_describe_query)[ 11096 "column_name" 11097 ] 11098 ) 11099 11100 # Create list of columns for select clause 11101 transcripts_tmp2_describe_select_clause = [] 11102 for field in transcripts_tmp2_describe_list: 11103 if field not in [ 11104 "#CHROM", 11105 "POS", 11106 "REF", 11107 "ALT", 11108 "INFO", 11109 "transcript_mapped", 11110 ]: 11111 as_field = field 11112 if field in ["transcript_original"]: 11113 as_field = "transcripts_mapped" 11114 transcripts_tmp2_describe_select_clause.append( 11115 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """ 11116 ) 11117 11118 # Merge with mapping 11119 query_merge_on_transcripts = f""" 11120 SELECT 11121 "#CHROM", POS, REF, ALT, INFO, 11122 CASE 11123 WHEN ANY_VALUE(transcript_mapped) NOT IN ('') 11124 THEN ANY_VALUE(transcript_mapped) 11125 ELSE ANY_VALUE(transcript_original) 11126 END AS transcript, 11127 {", ".join(transcripts_tmp2_describe_select_clause)} 11128 FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2} 11129 GROUP BY "#CHROM", POS, REF, ALT, INFO, 11130 {query_transcript_merge_group_by} 11131 """ 11132 11133 # Add transcript filter from mapping file 11134 if transcript_id_mapping_force: 11135 query_merge_on_transcripts = f""" 11136 SELECT * 11137 FROM ({query_merge_on_transcripts}) AS 
{transcript_table_tmp3} 11138 WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe) 11139 """ 11140 11141 # No transcript mapping 11142 else: 11143 11144 # Remove transcript version 11145 if transcript_id_remove_version: 11146 query_transcript_column = f""" 11147 split_part({transcript_table_tmp}.transcript, '.', 1) 11148 """ 11149 else: 11150 query_transcript_column = """ 11151 transcript 11152 """ 11153 11154 # Query sections 11155 query_transcript_column_select = ( 11156 f"{query_transcript_column} AS transcript" 11157 ) 11158 query_transcript_column_group_by = query_transcript_column 11159 11160 # Query for transcripts view 11161 query_merge_on_transcripts = f""" 11162 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)} 11163 FROM ({query_merge}) AS {transcript_table_tmp} 11164 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} 11165 """ 11166 11167 # Drop transcript view is necessary 11168 if transcripts_table_drop: 11169 query_drop = f""" 11170 DROP TABLE IF EXISTS {transcripts_table}; 11171 """ 11172 self.execute_query(query=query_drop) 11173 11174 # List of unique #CHROM 11175 query_unique_chrom = f""" 11176 SELECT DISTINCT "#CHROM" 11177 FROM variants AS subquery 11178 """ 11179 unique_chroms = self.get_query_to_df(query=query_unique_chrom) 11180 11181 # Create table with structure but without data, if not exists 11182 query_create_table = f""" 11183 CREATE TABLE IF NOT EXISTS {transcripts_table} AS 11184 SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0 11185 """ 11186 self.execute_query(query=query_create_table) 11187 11188 # Process by #CHROM 11189 for chrom in unique_chroms["#CHROM"]: 11190 11191 # Log 11192 log.debug(f"Processing #CHROM={chrom}") 11193 11194 # Select data by #CHROM 11195 query_chunk = f""" 11196 SELECT * 11197 
    def annotation_format_to_table(
        self,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple:
        """
        Explode a multi-entry VCF annotation field (e.g. snpEff/VEP 'ANN') into a
        SQL view with one row per annotation entry and one typed column per
        annotation sub-field.

        Sub-field names are parsed from the single-quoted section of the INFO
        header description (expected shape: "... 'A | B | C'"), reduced to their
        alphanumeric characters, optionally renamed/cleaned/case-folded, and
        typed by sampling up to 1000 exploded rows.

        :param annotation_field: name of the INFO field holding the annotations,
            defaults to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: annotation sub-field exposed as the 'transcript'
            column of the view, defaults to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: name of the view to create, defaults to "transcripts"
        :type view_name: str (optional)
        :param column_rename: mapping of original sub-field names to new names,
            applied to both `annotation_id` and every extracted column
        :type column_rename: dict
        :param column_clean: clean sub-field names with `clean_annotation_field`,
            defaults to False
        :type column_clean: bool (optional)
        :param column_case: force sub-field names to "lower" or "upper" case
        :type column_case: str
        :return: tuple `(view_name, added_columns)` where `view_name` is the
            created view name (or None when `annotation_field` is absent from
            the VCF header) and `added_columns` lists the columns added to the
            variants table by `explode_infos`
        """

        # Apply rename/clean to the transcript-identifier sub-field first, so it
        # matches the (renamed/cleaned) column names generated below
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for generated column names: any non-empty configured prefix is
        # normalized to "INFO/" (the configured value itself is not used)
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description to parse)
        vcf_reader = self.get_header()

        # Columns added to the variants table as a side effect (returned so the
        # caller can drop them afterwards)
        added_columns = []

        # Explode the annotation field into its own column on the variants table
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract sub-field names from the quoted part of the header
            # description, e.g. "Functional annotations: 'Allele | Annotation | ...'"
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for SQL-safe column names
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    # Map cleaned name -> original sub-field label
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Sample exploded annotations to infer each sub-field's column type.
            # NOTE(review): the query splits the literal "ANN" column, not
            # {annotation_field} — this only works when the exploded column is
            # named ANN; confirm behavior for other annotation fields.
            dataframe_annotation_format = self.get_query_to_df(
                f"""
                WITH exploded_annotations AS (
                    SELECT
                        UNNEST(STRING_SPLIT(ANN, ',')) AS annotation
                    FROM {table_variants}
                ),
                split_annotations AS (
                    SELECT
                        {", ".join([f"SPLIT_PART(annotation, '|', {i+1}) AS '{header}'" for i, header in enumerate(ann_header_desc.values())])},
                    FROM exploded_annotations
                )
                SELECT * FROM split_annotations
                LIMIT 1000
                """
            )

            # Build one typed SELECT expression per sub-field
            query_list_keys = []
            key_i = 0

            for key in dataframe_annotation_format.keys():

                # SPLIT_PART positions are 1-based
                key_i += 1
                key_clean = key

                # Key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Detect column type from the sampled values
                column_type = detect_column_type(dataframe_annotation_format[key])

                # Empty strings become NULL before the cast to the detected type
                query_list_keys.append(
                    f""" NULLIF(SPLIT_PART(annotation, '|', {key_i}), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the view: explode annotations to rows, split each entry
            # into typed columns, and expose `annotation_id` as 'transcript'
            query_create_view = f"""
                CREATE VIEW {view_name} AS (
                    WITH exploded_annotations AS (
                        SELECT
                            "#CHROM",
                            POS,
                            REF,
                            ALT,
                            INFO,
                            UNNEST(STRING_SPLIT(ANN, ',')) AS annotation
                        FROM {table_variants}
                    ),
                    split_annotations AS (
                        SELECT
                            "#CHROM",
                            POS,
                            REF,
                            ALT,
                            INFO,
                            {", ".join(query_list_keys)},
                        FROM exploded_annotations
                    )
                    SELECT *, {annotation_id} AS 'transcript' FROM split_annotations
                )
            """
            log.debug(f"query_create_view: {query_create_view}")
            self.execute_query(query=query_create_view)

        else:

            # Annotation field not in header: no view created
            view_name = None

        return view_name, added_columns
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Fold the per-transcript annotations of a transcripts table back into the
        variants table, as a JSON column, a JSON INFO field, a structured
        (pipe-separated) column, and/or a structured INFO field.

        Each unset argument falls back to the corresponding key of
        `param["transcripts"]`. If none of the four output targets is
        configured, nothing is done.

        :param transcripts_table: name of the table containing per-transcript
            annotations; defaults to param `transcripts.table` or "transcripts"
        :type transcripts_table: str
        :param transcripts_column_id: column of `transcripts_table` holding the
            transcript identifier; defaults to param `transcripts.column_id` or
            "transcript"
        :type transcripts_column_id: str
        :param transcripts_info_json: variants-table column to create and fill
            with a JSON object keyed by transcript id
        :type transcripts_info_json: str
        :param transcripts_info_field_json: INFO field name appended to INFO
            with the same JSON payload (also registered in the VCF header)
        :type transcripts_info_field_json: str
        :param transcripts_info_format: variants-table column to create and fill
            with pipe-separated per-transcript annotation strings
        :type transcripts_info_format: str
        :param transcripts_info_field_format: INFO field name appended to INFO
            with the pipe-separated payload (also registered in the VCF header)
        :type transcripts_info_field_format: str
        :param param: parameter dictionary; defaults to `self.get_param()`
        :type param: dict
        :return: True when an update was configured and attempted, False when no
            output target was configured
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default values for parameters not set by argument nor by param dict
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to do when no output target is configured
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Annotation columns of the transcripts table (everything except the
        # variant key columns and the transcript id column)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # Build, per annotation column: a row-exploding SELECT clause (one row
        # per comma-separated value), a JSON key:value clause, and a FORMAT clause
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            # Do not consider INFO field for export into fields
            if field not in ["INFO"]:
                clause_select.append(
                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
                )
                clause_to_json.append(f""" '{field}': "{field}" """)
                clause_to_format.append(f""" "{field}" """)

        # SET clauses accumulated for the two UPDATE statements below
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json:

            # Create the JSON column on the variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Register the field in the VCF header
            # NOTE(review): source/version are the literal string "unknwon"
            # (typo kept — it is runtime data, changing it alters output)
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append ';<field>=<json>' to INFO, skipping empty/'.' values
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Register the field in the VCF header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Aggregate per variant: one JSON object mapping transcript id to
            # its annotations, then join back on the variant key columns
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format:

            # Create the FORMAT column on the variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Register the field in the VCF header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        else:

            # No target column configured: use a fixed alias name so the
            # internal queries below remain valid
            transcripts_info_format = "transcripts_info_format"

        # Transcripts to info field in structured FORMAT
        if transcripts_info_field_format:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append ';<field>=<formatted>' to INFO, skipping empty/'.' values
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Register the field in the VCF header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Aggregate per variant: pipe-joined annotation strings per
            # transcript, comma-aggregated across transcripts, joined back on
            # the variant key columns
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        Rename or remove INFO fields, both in the VCF header and inside the INFO
        column of the variants table.

        A mapping value of None means "remove the field". Nothing is done when
        `fields_to_rename` is None or when config access mode is "RO".

        :param fields_to_rename: mapping of original field name -> new name
            (or None to remove the field)
        :type fields_to_rename: dict
        :param table: table whose INFO column is rewritten; defaults to the
            variants table
        :type table: str
        :return: mapping of each processed original field name to its new name
            (or None when the field was removed)
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        if table is None:
            table = self.get_table_variants()

        # Nested regexp_replace expressions are built incrementally; they are
        # partitioned into batches of 125 patterns to keep each SQL expression
        # within reasonable complexity
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        # A trailing ';' is appended to INFO so every field (including the
        # last one) is followed by ';', which simplifies the patterns below
        regex_replace = "concat(INFO, ';')"

        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Rename in header: copy the old _Info under the new name
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    # Drop the old header entry (rename and removal cases)
                    del header.infos[field_to_rename]

                    # INFO pattern: field at start or after ';', with optional
                    # '=value', always followed by ';' (guaranteed by the
                    # concat above).
                    # NOTE(review): field names are interpolated unescaped —
                    # regex metacharacters in a field name would break the
                    # pattern; confirm field names are always word-like.
                    field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;"
                    if field_renamed is not None:
                        field_renamed_pattern = rf"\1{field_renamed}\3;"
                    else:
                        # Removal: keep only the leading separator
                        field_renamed_pattern = r"\1"

                    # Nest this replacement into the current batch expression
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(
                        regex_replace_nb / regex_replace_partition
                    )
                    # Start a fresh expression at each partition boundary
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "concat(INFO, ';')"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Record for the returned mapping
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'"
                        )
                    else:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' removed"
                        )

                else:

                    log.warning(
                        f"Rename or remove fields - field '{field_to_rename}' not in header"
                    )

            # Apply each batch of nested replacements; the final regexp_replace
            # strips the ';' added by concat(INFO, ';')
            for regex_replace_key, regex_replace in regex_replace_dict.items():
                log.info(
                    f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..."
                )
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = regexp_replace({regex_replace}, ';$', '')
                """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed
Each key-value pair in the dictionary represents the original field name as 11909 the key and the new field name as the value 11910 :type fields_to_rename: dict 11911 :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to 11912 specify the name of the table for which the fields are to be renamed. It is a string type 11913 parameter 11914 :type table: str 11915 :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields` 11916 method is a string that specifies the name of the operation being performed. In this context, it 11917 is used as a default value for the operation name if not explicitly provided when calling the 11918 function, defaults to RENAME_INFO_FIELDS 11919 :type operation_name: str (optional) 11920 """ 11921 11922 # Param 11923 param = self.get_param() 11924 11925 # Get param fields to rename 11926 param_fields_to_rename = ( 11927 param.get("calculation", {}) 11928 .get("calculations", {}) 11929 .get(operation_name, {}) 11930 .get("fields_to_rename", None) 11931 ) 11932 11933 # Get param table 11934 param_table = ( 11935 param.get("calculation", {}) 11936 .get("calculations", {}) 11937 .get(operation_name, {}) 11938 .get("table", None) 11939 ) 11940 11941 # Init fields_to_rename 11942 if fields_to_rename is None: 11943 fields_to_rename = param_fields_to_rename 11944 11945 # Init table 11946 if table is None: 11947 table = param_table 11948 11949 renamed_fields = self.rename_info_fields( 11950 fields_to_rename=fields_to_rename, table=table 11951 ) 11952 11953 log.debug(f"renamed_fields:{renamed_fields}") 11954 11955 def create_annotations_view( 11956 self, 11957 table: str = None, 11958 view: str = None, 11959 view_type: str = None, 11960 fields: list = None, 11961 prefix: str = "", 11962 drop_view: bool = False, 11963 fields_to_rename: dict = None, 11964 limit: int = None, 11965 ) -> str: 11966 """ 11967 The `create_annotations_view` function creates a SQL view from fields in a 
VCF INFO column. 11968 11969 :param table: The `table` parameter in the `create_annotations_view` function is used to specify 11970 the name of the table from which the fields are to be extracted. This table contains the 11971 variants data, and the function creates a view based on the fields in the INFO column of this 11972 table 11973 :type table: str 11974 :param view: The `view` parameter in the `create_annotations_view` function is used to specify 11975 the name of the view that will be created based on the fields in the VCF INFO column. This view 11976 will contain the extracted fields from the INFO column in a structured format for further 11977 processing or analysis 11978 :type view: str 11979 :param view_type: The `view_type` parameter in the `create_annotations_view` function is used to 11980 specify the type of view that will be created. It can be either a `VIEW` or a `TABLE`, and the 11981 function will create the view based on the specified type 11982 :type view_type: str 11983 :param fields: The `fields` parameter in the `create_annotations_view` function is a list that 11984 contains the names of the fields to be extracted from the INFO column in the VCF file. These 11985 fields will be used to create the view with the specified columns and data extracted from the 11986 INFO column 11987 :type fields: list 11988 :param prefix: The `prefix` parameter in the `create_annotations_view` function is used to 11989 specify a prefix that will be added to the field names in the view. This prefix helps in 11990 distinguishing the fields extracted from the INFO column in the view 11991 :type prefix: str 11992 :param drop_view: The `drop_view` parameter in the `create_annotations_view` function is a boolean 11993 flag that determines whether to drop the existing view with the same name before creating a new 11994 view. 
If set to `True`, the function will drop the existing view before creating a new view with 11995 the specified name 11996 :type drop_view: bool 11997 :param fields_to_rename: The `fields_to_rename` parameter in the `create_annotations_view` 11998 function is a dictionary that contains the mapping of fields to be renamed in the VCF file. The 11999 keys in the dictionary represent the original field names that need to be renamed, and the 12000 corresponding values represent the new names to which the fields should be 12001 :type fields_to_rename: dict 12002 :param limit: The `limit` parameter in the `create_annotations_view` function is an integer that 12003 specifies the maximum number of rows to be included in the view. If provided, the function will 12004 limit the number of rows in the view to the specified value 12005 :type limit: int 12006 :return: The `create_annotations_view` function returns the name of the view that is created 12007 based on the fields extracted from the INFO column in the VCF file. This view contains the 12008 extracted fields in a structured format for further processing or analysis 12009 """ 12010 12011 # Create a sql view from fields in VCF INFO column, with each column is a field present in the VCF header (with a specific type from VCF header) and extracted from INFO column (with a regexp like in rename_info_fields), and each row is a variant. 12012 12013 # Get table 12014 if table is None: 12015 table = self.get_table_variants() 12016 12017 # Get view 12018 if view is None: 12019 view = f"{table}_annotations" 12020 12021 # Get view type 12022 if view_type is None: 12023 view_type = "VIEW" 12024 12025 # Check view type value 12026 if view_type.upper() not in ["VIEW", "TABLE"]: 12027 raise ValueError( 12028 f"Invalid view type value: {view_type}. 
Either 'VIEW' or 'TABLE'" 12029 ) 12030 12031 # Get header 12032 header = self.get_header() 12033 12034 # Get fields 12035 if fields is None: 12036 fields = list(header.infos.keys()) 12037 12038 # Get fields to rename 12039 if fields_to_rename is None: 12040 fields_to_rename = {} 12041 12042 log.info( 12043 f"Create '{view}' view (as '{view_type}') from table '{table}' with {len(fields)} fields" 12044 ) 12045 12046 # Describe table 12047 table_describe_query = f""" 12048 DESCRIBE {table} 12049 """ 12050 table_describe = self.get_query_to_df(query=table_describe_query) 12051 12052 # Create fields for annotation view extracted from INFO column in table variants (with regexp_replace like in rename_info_fields), with column type from VCF header 12053 fields_columns = [] 12054 fields_needed = ["#CHROM", "POS", "REF", "ALT"] 12055 field_sql_type_list = False 12056 for field in fields: 12057 12058 # Rename field 12059 field_to_rename = fields_to_rename.get(field, field) 12060 12061 # Check field type 12062 12063 # Needed fields 12064 if field in fields_needed: 12065 continue 12066 12067 # Fields in table 12068 elif field in list(table_describe.get("column_name")): 12069 fields_columns.append(f""" "{field}" AS '{prefix}{field_to_rename}' """) 12070 12071 # Fields in header 12072 elif field in header.infos: 12073 12074 # Field info 12075 field_infos = header.infos.get(field, None) 12076 12077 # Field SQL type 12078 field_sql_type = code_type_map_to_sql.get(field_infos.type, "VARCHAR") 12079 12080 # Column is a list 12081 if field_infos.num != 1: 12082 field_sql_type_list = True 12083 12084 # Colonne is a flag 12085 if field_infos.type == "Flag": 12086 field_pattern = rf"(^|;)({field})([^;]*)?" 12087 fields_columns.append( 12088 f""" regexp_matches("INFO", '{field_pattern}')::BOOLEAN AS '{prefix}{field_to_rename}' """ 12089 ) 12090 12091 # Colonne with a type 12092 else: 12093 12094 # Field pattern 12095 field_pattern = rf"(^|;)({field})=([^;]*)?" 
12096 12097 # Field is a list 12098 if field_sql_type_list: 12099 fields_columns.append( 12100 f""" CAST(list_transform(string_split(NULLIF(regexp_extract("INFO", '{field_pattern}', 3), ''), ','), x -> CASE WHEN x = '.' OR x = '' THEN NULL ELSE x END) AS {field_sql_type}[]) AS '{prefix}{field_to_rename}' """ 12101 ) 12102 12103 # Field is a unique value 12104 else: 12105 fields_columns.append( 12106 f""" NULLIF(regexp_replace(regexp_extract("INFO", '{field_pattern}', 3), '^\\.$', ''), '')::{field_sql_type} AS '{prefix}{field_to_rename}' """ 12107 ) 12108 12109 else: 12110 fields_columns.append(f""" null AS '{prefix}{field_to_rename}' """) 12111 msg_err = f"Field '{field}' is not found (in table or header): '{field}' will be set to NULL" 12112 log.warning(msg=msg_err) 12113 12114 # Limit 12115 limit_clause = "" 12116 if limit is not None: 12117 limit_clause = f" LIMIT {limit} " 12118 12119 # Query select 12120 query_select = f""" 12121 SELECT 12122 {', '.join([f'"{field}"' for field in fields_needed])}, {", ".join(fields_columns)} 12123 FROM 12124 {table} 12125 {limit_clause} 12126 """ 12127 12128 # Drop if any 12129 if drop_view: 12130 log.debug(f"Drop view: {view}") 12131 query_create_view = f""" 12132 DROP {view_type} IF EXISTS {view} 12133 """ 12134 self.execute_query(query=query_create_view) 12135 log.debug(f"View dropped: {view}") 12136 12137 # Create view 12138 log.debug(f"Create view: {view}") 12139 query_create_view = f""" 12140 CREATE {view_type} IF NOT EXISTS {view} AS {query_select} 12141 """ 12142 # log.debug(f"query_create_view:{query_create_view}") 12143 self.execute_query(query=query_create_view) 12144 log.debug(f"View created: {view}") 12145 12146 return view
class Variants:
    # Core object wrapping a variants dataset (VCF/TSV/Parquet/duckdb...) behind a
    # duckdb or sqlite connection, with its VCF header and processing parameters.

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        Initialize the object: input, config, param, output, database connexion,
        VCF header and samples; optionally load the data immediately.

        NOTE(review): `config` and `param` use mutable dict defaults — shared across
        calls if ever mutated; consider `None` sentinels.

        :param conn: the connection to the database (created if not provided)
        :param input: the input file
        :param output: the output file
        :param config: a dictionary containing the configuration
        :param param: a dictionary containing the parameters
        :param load: if True, load the data after initialization
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        Set the `samples` attribute to the provided list, or fall back to the
        'samples'.'list' entry of the parameters.

        :param samples: list of sample names; if falsy, taken from
        `get_param()['samples']['list']` (may be None)
        :type samples: list
        :return: the `samples` list that was set
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples

    def get_samples(self) -> list:
        """
        Return the list of samples.
        :return: the `samples` attribute of the object.
        """

        return self.samples

    def get_samples_check(self) -> bool:
        """
        Return the 'check' flag of the 'samples' parameter dictionary.
        :return: the value of `get_param()['samples']['check']`; defaults to `True`
        when the key is not found.
        """

        return self.get_param().get("samples", {}).get("check", True)

    def set_input(self, input: str = None) -> None:
        """
        Set the input file and derive its name, extension and format attributes.

        :param input: input file path, or a file-like object with a `.name`
        attribute
        :type input: str
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            except:  # NOTE(review): bare except hides the real error; narrow to AttributeError
                log.error(f"Input file '{input} in bad format")
                raise ValueError(f"Input file '{input} in bad format")
        else:
            self.input = input

        # Input format (derived from the file extension)
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        Set the configuration dictionary for this object.

        :param config: dictionary of configuration settings
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        Set the parameters dictionary for this object.

        :param param: dictionary of parameters
        :type param: dict
        """

        self.param = param

    def init_variables(self) -> None:
        """
        Initialize the attributes and lookup maps used by the rest of the class.
        """

        self.prefix = "howard"
        self.table_variants = "variants"
        self.dataframe = None

        # Map of comparison keywords to SQL operators
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # VCF header type -> internal type code
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # VCF header type -> SQL column type
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        Return the 'indexing' parameter.
        :return: the value of the 'indexing' key in the parameters; `False` if absent.
        """

        return self.get_param().get("indexing", False)

    def get_connexion_config(self) -> dict:
        """
        Build the connection configuration dictionary (threads, memory limit,
        temporary directory, access mode) from the object configuration.
        :return: a dictionary usable as duckdb connection config.
        """

        # config
        config = self.get_config()

        # Connexion config
        connexion_config = {}
        threads = self.get_threads()

        # Threads
        if threads:
            connexion_config["threads"] = threads

        # Memory
        if self.get_memory():
            connexion_config["memory_limit"] = self.get_memory()

        # Temporary directory
        if config.get("tmp", None):
            connexion_config["temp_directory"] = config.get("tmp")

        # Access mode: RO/RW shorthand mapped to duckdb access modes
        if config.get("access", None):
            access = config.get("access")
            if access in ["RO"]:
                access = "READ_ONLY"
            elif access in ["RW"]:
                access = "READ_WRITE"
            connexion_db = self.get_connexion_db()
            # NOTE(review): `in` here is a substring test on ":memory:", not an
            # equality test — confirm this is intended
            if connexion_db in ":memory:":
                access = "READ_WRITE"
            connexion_config["access_mode"] = access

        return connexion_config

    def get_duckdb_settings(self) -> dict:
        """
        Retrieve DuckDB settings from the configuration, given either as a file
        (JSON/YAML) or as a JSON string.
        :return: a dictionary of duckdb settings (empty if none configured).
        """

        # config
        config = self.get_config()

        # duckdb settings
        duckdb_settings_dict = {}
        if config.get("duckdb_settings", None):
            duckdb_settings = config.get("duckdb_settings")
            duckdb_settings = full_path(duckdb_settings)
            # duckdb setting is a file
            if os.path.exists(duckdb_settings):
                with open(duckdb_settings) as json_file:
                    duckdb_settings_dict = yaml.safe_load(json_file)
            # duckdb settings is a string
            else:
                duckdb_settings_dict = json.loads(duckdb_settings)

        return duckdb_settings_dict

    def set_connexion_db(self) -> str:
        """
        Determine and set the database connection target based on the input format
        and connection type (in-memory, temporary file, explicit path).
        :return: the connection db string (also stored in `self.connexion_db`).
        """

        # Default connexion db
        default_connexion_db = ":memory:"

        # Find connexion db
        if self.get_input_format() in ["db", "duckdb"]:
            connexion_db = self.get_input()
        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
            connexion_db = default_connexion_db
        elif self.get_connexion_type() in ["tmpfile"]:
            # Create a temporary folder and use a db file inside it
            tmp_name = tempfile.mkdtemp(
                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
            )
            connexion_db = f"{tmp_name}/tmp.db"
        elif self.get_connexion_type() != "":
            connexion_db = self.get_connexion_type()
        else:
            connexion_db = default_connexion_db

        # Set connexion db
        self.connexion_db = connexion_db

        return connexion_db

    def set_connexion(self, conn) -> None:
        """
        Create (if needed) and store the database connection, applying the
        configured duckdb settings, for either duckdb or sqlite format.

        :param conn: an existing connection, or None to create one from the
        configured connexion db/format
        """

        # Connexion db
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # Apply duckDB settings through PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        Set the output file and derive its name, extension and format attributes.

        :param output: output file path, or a file-like object with a `.name`
        attribute; None clears the output attributes
        :type output: str
        """

        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format (derived from the file extension)
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None

    def set_header(self) -> None:
        """
        Read the VCF header of the input file and store it both as a list of
        strings (`header_list`) and as a PyVCF Reader object (`header_vcf`).
        Falls back to a minimal default VCF header when none can be found.
        """

        input_file = self.get_input()
        # NOTE(review): the #CHROM line should be tab-separated per VCF spec —
        # confirm the separators in this literal against the canonical source
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:  # NOTE(review): bare except — any failure silently falls back to the default header

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unknown format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute a SQL query and return the result as a pandas DataFrame, using
        the appropriate fetch mechanism for the connection format.

        :param query: the SQL query to execute
        :type query: str
        :param limit: optional maximum number of rows to fetch
        :type limit: int
        :return: a pandas DataFrame with the query result.
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query: fetch only the first chunk of rows
        if limit:
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df

    def get_overview(self) -> None:
        """
        Log an overview of the current object: input, output, config, param,
        sample list and a preview dataframe of the variants.
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # garbage collector
        del df
        gc.collect()

        return None

    def get_stats(self) -> dict:
        """
        Calculate and return statistics of the current dataset: input file,
        variants (by chromosome, counts, substitutions), samples/genotypes,
        header INFO/FORMAT fields, and quality metrics.
        :return: a dictionary of statistics with sections 'Infos', 'Variants',
        'Samples', 'Header' and optionally 'Quality'.
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: count genotypes per sample when GT/FORMAT are available
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT '{sample}' as sample,
                    REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                    count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                    concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                    )
                    GROUP BY genotype
                """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map PyVCF special Number codes to VCF letters (A/G/R/.)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
            """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

        """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
        """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

    def stats_to_file(self, file: str = None) -> str:
        """
        Serialize the statistics (from `get_stats`) as JSON and write them to
        the given file.

        :param file: path of the file where the JSON data will be written
        :type file: str
        :return: the name of the file that was written.
        """

        # Get stats
        stats = self.get_stats()

        # Serializing json
        json_object = json.dumps(stats, indent=4)

        # Writing to file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file

    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown file from the JSON statistics and print the
        statistics in a formatted manner.

        :param output_file: path of the markdown output file; defaults to
        'stats.md' in a temporary directory
        :type output_file: str
        :param json_file: path of the JSON stats file; defaults to 'stats.json'
        in a temporary directory
        :type json_file: str
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the info as a markdown table, either
                        # directly or after JSON-decoding; fall back to plain text
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f" - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

    def get_input(self) -> str:
        """
        Return the input file.
        :return: the `input` attribute.
        """
        return self.input

    def get_input_format(self, input_file: str = None) -> str:
        """
        Return the format of the given file, or of the object's input file.

        :param input_file: path of a file; defaults to the object's input
        :type input_file: str
        :return: the file format string.
        """

        if not input_file:
            input_file = self.get_input()
        input_format = get_file_format(input_file)
        return input_format

    def get_input_compressed(self, input_file: str = None) -> str:
        """
        Return whether the given file (or the object's input file) is compressed.

        :param input_file: path of a file; defaults to the object's input
        :type input_file: str
        :return: the compression status of the file.
        """

        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        Return the output file.
        :return: the `output` attribute.
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        Return the format of the given file, or of the object's output file.

        :param output_file: path of a file; defaults to the object's output
        :type output_file: str
        :return: the file format string.
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        Return the configuration dictionary.
        :return: the `config` attribute.
        """
        return self.config

    def get_param(self) -> dict:
        """
        Return the parameters dictionary.
        :return: the `param` attribute.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        Return the connexion db string.
        :return: the `connexion_db` attribute.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        Return the prefix of the object.
        :return: the `prefix` attribute.
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        Return the variants table name, optionally decorated for use in a FROM
        clause (with alias, or as a direct parquet file reference in read-only).

        :param clause: the type of clause the table will be used in: "select",
        "where", "update" or "from" (optional), defaults to "select"
        :return: the table name (possibly aliased) for the requested clause.
        """

        # Access
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update"
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from"
        elif clause in ["from"]:
            # For Read Only parquet input, query the file directly
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        Return the temporary directory path based on configuration and
        parameters, with "/tmp" as the default.
        :return: the temporary directory path.
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        Return the connexion type from the configuration.
        :return: the 'connexion_type' config value; defaults to "memory".
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        Return the database connection object.
        :return: the `conn` attribute.
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        This function closes the connection to the database.
1074 :return: The connection is being closed. 1075 """ 1076 return self.conn.close() 1077 1078 def get_header(self, type: str = "vcf"): 1079 """ 1080 This function returns the header of the VCF file as a list of strings 1081 1082 :param type: the type of header you want to get, defaults to vcf (optional) 1083 :return: The header of the vcf file. 1084 """ 1085 1086 if self.header_vcf: 1087 if type == "vcf": 1088 return self.header_vcf 1089 elif type == "list": 1090 return self.header_list 1091 else: 1092 if type == "vcf": 1093 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1094 return header 1095 elif type == "list": 1096 return vcf_required 1097 1098 def get_header_infos_list(self) -> list: 1099 """ 1100 This function retrieves a list of information fields from the header. 1101 :return: A list of information fields from the header. 1102 """ 1103 1104 # Init 1105 infos_list = [] 1106 1107 for field in self.get_header().infos: 1108 infos_list.append(field) 1109 1110 return infos_list 1111 1112 def get_header_length(self, file: str = None) -> int: 1113 """ 1114 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1115 line. 1116 1117 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1118 header file. If this argument is provided, the function will read the header from the specified 1119 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1120 :type file: str 1121 :return: the length of the header list, excluding the #CHROM line. 1122 """ 1123 1124 if file: 1125 return len(self.read_vcf_header_file(file=file)) - 1 1126 elif self.get_header(type="list"): 1127 return len(self.get_header(type="list")) - 1 1128 else: 1129 return 0 1130 1131 def get_header_columns(self) -> str: 1132 """ 1133 This function returns the header list of a VCF 1134 1135 :return: The length of the header list. 
1136 """ 1137 if self.get_header(): 1138 return self.get_header(type="list")[-1] 1139 else: 1140 return "" 1141 1142 def get_header_columns_as_list(self) -> list: 1143 """ 1144 This function returns the header list of a VCF 1145 1146 :return: The length of the header list. 1147 """ 1148 if self.get_header(): 1149 return self.get_header_columns().strip().split("\t") 1150 else: 1151 return [] 1152 1153 def get_header_columns_as_sql(self) -> str: 1154 """ 1155 This function retruns header length (without #CHROM line) 1156 1157 :return: The length of the header list. 1158 """ 1159 sql_column_list = [] 1160 for col in self.get_header_columns_as_list(): 1161 sql_column_list.append(f'"{col}"') 1162 return ",".join(sql_column_list) 1163 1164 def get_header_sample_list( 1165 self, check: bool = False, samples: list = None, samples_force: bool = False 1166 ) -> list: 1167 """ 1168 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1169 checking and filtering based on input parameters. 1170 1171 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1172 parameter that determines whether to check if the samples in the list are properly defined as 1173 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1174 list is defined as a, defaults to False 1175 :type check: bool (optional) 1176 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1177 allows you to specify a subset of samples from the header. If you provide a list of sample 1178 names, the function will check if each sample is defined in the header. 
If a sample is not found 1179 in the 1180 :type samples: list 1181 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1182 a boolean parameter that determines whether to force the function to return the sample list 1183 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1184 function will return the sample list without performing, defaults to False 1185 :type samples_force: bool (optional) 1186 :return: The function `get_header_sample_list` returns a list of samples based on the input 1187 parameters and conditions specified in the function. 1188 """ 1189 1190 # Init 1191 samples_list = [] 1192 1193 if samples is None: 1194 samples_list = self.header_vcf.samples 1195 else: 1196 samples_checked = [] 1197 for sample in samples: 1198 if sample in self.header_vcf.samples: 1199 samples_checked.append(sample) 1200 else: 1201 log.warning(f"Sample '{sample}' not defined in header") 1202 samples_list = samples_checked 1203 1204 # Force sample list without checking if is_genotype_column 1205 if samples_force: 1206 log.warning(f"Samples {samples_list} not checked if genotypes") 1207 return samples_list 1208 1209 if check: 1210 samples_checked = [] 1211 for sample in samples_list: 1212 if self.is_genotype_column(column=sample): 1213 samples_checked.append(sample) 1214 else: 1215 log.warning( 1216 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1217 ) 1218 samples_list = samples_checked 1219 1220 # Return samples list 1221 return samples_list 1222 1223 def is_genotype_column(self, column: str = None) -> bool: 1224 """ 1225 This function checks if a given column is a genotype column in a database. 1226 1227 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1228 represents the column name in a database table. This method checks if the specified column is a 1229 genotype column in the database. 
If a column name is provided, it calls the `is_genotype_column` 1230 method of 1231 :type column: str 1232 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1233 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1234 column name and returns the result. If the `column` parameter is None, it returns False. 1235 """ 1236 1237 if column is not None: 1238 return Database(database=self.get_input()).is_genotype_column(column=column) 1239 else: 1240 return False 1241 1242 def get_verbose(self) -> bool: 1243 """ 1244 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1245 exist 1246 1247 :return: The value of the key "verbose" in the config dictionary. 1248 """ 1249 return self.get_config().get("verbose", False) 1250 1251 def get_connexion_format(self) -> str: 1252 """ 1253 It returns the connexion format of the object. 1254 :return: The connexion_format is being returned. 1255 """ 1256 connexion_format = self.connexion_format 1257 if connexion_format not in ["duckdb", "sqlite"]: 1258 log.error(f"Unknown connexion format {connexion_format}") 1259 raise ValueError(f"Unknown connexion format {connexion_format}") 1260 else: 1261 return connexion_format 1262 1263 def insert_file_to_table( 1264 self, 1265 file, 1266 columns: str, 1267 header_len: int = 0, 1268 sep: str = "\t", 1269 chunksize: int = 1000000, 1270 ) -> None: 1271 """ 1272 The function reads a file in chunks and inserts each chunk into a table based on the specified 1273 database format. 1274 1275 :param file: The `file` parameter is the file that you want to load into a table. It should be 1276 the path to the file on your system 1277 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1278 should contain the names of the columns in the table where the data will be inserted. 
The column 1279 names should be separated by commas within the string. For example, if you have columns named 1280 "id", "name 1281 :type columns: str 1282 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1283 the number of lines to skip at the beginning of the file before reading the actual data. This 1284 parameter allows you to skip any header information present in the file before processing the 1285 data, defaults to 0 1286 :type header_len: int (optional) 1287 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1288 separator character that is used in the file being read. In this case, the default separator is 1289 set to `\t`, which represents a tab character. You can change this parameter to a different 1290 separator character if, defaults to \t 1291 :type sep: str (optional) 1292 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1293 when processing the file in chunks. In the provided code snippet, the default value for 1294 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1295 to 1000000 1296 :type chunksize: int (optional) 1297 """ 1298 1299 # Config 1300 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1301 connexion_format = self.get_connexion_format() 1302 1303 log.debug("chunksize: " + str(chunksize)) 1304 1305 if chunksize: 1306 for chunk in pd.read_csv( 1307 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1308 ): 1309 if connexion_format in ["duckdb"]: 1310 sql_insert_into = ( 1311 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1312 ) 1313 self.conn.execute(sql_insert_into) 1314 elif connexion_format in ["sqlite"]: 1315 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1316 1317 def load_data( 1318 self, 1319 input_file: str = None, 1320 drop_variants_table: bool = False, 1321 sample_size: int = 20480, 1322 ) -> None: 1323 """ 1324 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1325 table before loading the data and specify a sample size. 1326 1327 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1328 table 1329 :type input_file: str 1330 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1331 determines whether the variants table should be dropped before loading the data. If set to 1332 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1333 not be dropped, defaults to False 1334 :type drop_variants_table: bool (optional) 1335 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1336 the input file. 
If it is set to `None`, the default value of 20480 will be used, defaults to 1337 20480 1338 :type sample_size: int (optional) 1339 """ 1340 1341 log.info("Loading...") 1342 1343 # change input file 1344 if input_file: 1345 self.set_input(input_file) 1346 self.set_header() 1347 1348 # drop variants table 1349 if drop_variants_table: 1350 self.drop_variants_table() 1351 1352 # get table variants 1353 table_variants = self.get_table_variants() 1354 1355 # Access 1356 access = self.get_config().get("access", None) 1357 log.debug(f"access: {access}") 1358 1359 # Input format and compress 1360 input_format = self.get_input_format() 1361 input_compressed = self.get_input_compressed() 1362 log.debug(f"input_format: {input_format}") 1363 log.debug(f"input_compressed: {input_compressed}") 1364 1365 # input_compressed_format 1366 if input_compressed: 1367 input_compressed_format = "gzip" 1368 else: 1369 input_compressed_format = "none" 1370 log.debug(f"input_compressed_format: {input_compressed_format}") 1371 1372 # Connexion format 1373 connexion_format = self.get_connexion_format() 1374 1375 # Sample size 1376 if not sample_size: 1377 sample_size = -1 1378 log.debug(f"sample_size: {sample_size}") 1379 1380 # Load data 1381 log.debug(f"Load Data from {input_format}") 1382 1383 # DuckDB connexion 1384 if connexion_format in ["duckdb"]: 1385 1386 # Database already exists 1387 if self.input_format in ["db", "duckdb"]: 1388 1389 if connexion_format in ["duckdb"]: 1390 log.debug(f"Input file format '{self.input_format}' duckDB") 1391 else: 1392 log.error( 1393 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1394 ) 1395 raise ValueError( 1396 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1397 ) 1398 1399 # Load from existing database format 1400 else: 1401 1402 try: 1403 # Create Table or View 1404 database = Database(database=self.input) 1405 sql_from = 
database.get_sql_from(sample_size=sample_size) 1406 1407 if access in ["RO"]: 1408 sql_load = ( 1409 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1410 ) 1411 else: 1412 sql_load = ( 1413 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1414 ) 1415 self.conn.execute(sql_load) 1416 1417 except: 1418 # Format not available 1419 log.error(f"Input file format '{self.input_format}' not available") 1420 raise ValueError( 1421 f"Input file format '{self.input_format}' not available" 1422 ) 1423 1424 # SQLite connexion 1425 elif connexion_format in ["sqlite"] and input_format in [ 1426 "vcf", 1427 "tsv", 1428 "csv", 1429 "psv", 1430 ]: 1431 1432 # Main structure 1433 structure = { 1434 "#CHROM": "VARCHAR", 1435 "POS": "INTEGER", 1436 "ID": "VARCHAR", 1437 "REF": "VARCHAR", 1438 "ALT": "VARCHAR", 1439 "QUAL": "VARCHAR", 1440 "FILTER": "VARCHAR", 1441 "INFO": "VARCHAR", 1442 } 1443 1444 # Strcuture with samples 1445 structure_complete = structure 1446 if self.get_header_sample_list(): 1447 structure["FORMAT"] = "VARCHAR" 1448 for sample in self.get_header_sample_list(): 1449 structure_complete[sample] = "VARCHAR" 1450 1451 # Columns list for create and insert 1452 sql_create_table_columns = [] 1453 sql_create_table_columns_list = [] 1454 for column in structure_complete: 1455 column_type = structure_complete[column] 1456 sql_create_table_columns.append( 1457 f'"{column}" {column_type} default NULL' 1458 ) 1459 sql_create_table_columns_list.append(f'"{column}"') 1460 1461 # Create database 1462 log.debug(f"Create Table {table_variants}") 1463 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1464 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1465 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1466 self.conn.execute(sql_create_table) 1467 1468 # chunksize define length of file chunk load file 1469 chunksize = 100000 1470 1471 # delimiter 1472 delimiter 
= file_format_delimiters.get(input_format, "\t") 1473 1474 # Load the input file 1475 with open(self.input, "rt") as input_file: 1476 1477 # Use the appropriate file handler based on the input format 1478 if input_compressed: 1479 input_file = bgzf.open(self.input, "rt") 1480 if input_format in ["vcf"]: 1481 header_len = self.get_header_length() 1482 else: 1483 header_len = 0 1484 1485 # Insert the file contents into a table 1486 self.insert_file_to_table( 1487 input_file, 1488 columns=sql_create_table_columns_list_sql, 1489 header_len=header_len, 1490 sep=delimiter, 1491 chunksize=chunksize, 1492 ) 1493 1494 else: 1495 log.error( 1496 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1497 ) 1498 raise ValueError( 1499 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1500 ) 1501 1502 # Explode INFOS fields into table fields 1503 if self.get_explode_infos(): 1504 self.explode_infos( 1505 prefix=self.get_explode_infos_prefix(), 1506 fields=self.get_explode_infos_fields(), 1507 force=True, 1508 ) 1509 1510 # Create index after insertion 1511 self.create_indexes() 1512 1513 def get_explode_infos(self) -> bool: 1514 """ 1515 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1516 to False if it is not set. 1517 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1518 value. If the parameter is not present, it will return False. 1519 """ 1520 1521 return self.get_param().get("explode", {}).get("explode_infos", False) 1522 1523 def get_explode_infos_fields( 1524 self, 1525 explode_infos_fields: str = None, 1526 remove_fields_not_in_header: bool = False, 1527 ) -> list: 1528 """ 1529 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1530 the input parameter `explode_infos_fields`. 
1531 1532 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1533 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1534 comma-separated list of field names to explode 1535 :type explode_infos_fields: str 1536 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1537 flag that determines whether to remove fields that are not present in the header. If it is set 1538 to `True`, any field that is not in the header will be excluded from the list of exploded 1539 information fields. If it is set to `, defaults to False 1540 :type remove_fields_not_in_header: bool (optional) 1541 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1542 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1543 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1544 Otherwise, it returns a list of exploded information fields after removing any spaces and 1545 splitting the string by commas. 
1546 """ 1547 1548 # If no fields, get it in param 1549 if not explode_infos_fields: 1550 explode_infos_fields = ( 1551 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1552 ) 1553 1554 # If no fields, defined as all fields in header using keyword 1555 if not explode_infos_fields: 1556 explode_infos_fields = "*" 1557 1558 # If fields list not empty 1559 if explode_infos_fields: 1560 1561 # Input fields list 1562 if isinstance(explode_infos_fields, str): 1563 fields_input = explode_infos_fields.split(",") 1564 elif isinstance(explode_infos_fields, list): 1565 fields_input = explode_infos_fields 1566 else: 1567 fields_input = [] 1568 1569 # Fields list without * keyword 1570 fields_without_all = fields_input.copy() 1571 if "*".casefold() in (item.casefold() for item in fields_without_all): 1572 fields_without_all.remove("*") 1573 1574 # Fields in header 1575 fields_in_header = sorted(list(set(self.get_header().infos))) 1576 1577 # Construct list of fields 1578 fields_output = [] 1579 for field in fields_input: 1580 1581 # Strip field 1582 field = field.strip() 1583 1584 # format keyword * in regex 1585 if field.upper() in ["*"]: 1586 field = ".*" 1587 1588 # Find all fields with pattern 1589 r = re.compile(rf"^{field}$") 1590 fields_search = sorted(list(filter(r.match, fields_in_header))) 1591 1592 # Remove fields input from search 1593 if field in fields_search: 1594 fields_search = [field] 1595 elif fields_search != [field]: 1596 fields_search = sorted( 1597 list(set(fields_search).difference(fields_input)) 1598 ) 1599 1600 # If field is not in header (avoid not well formatted header) 1601 if not fields_search and not remove_fields_not_in_header: 1602 fields_search = [field] 1603 1604 # Add found fields 1605 for new_field in fields_search: 1606 # Add field, if not already exists, and if it is in header (if asked) 1607 if ( 1608 new_field not in fields_output 1609 and ( 1610 not remove_fields_not_in_header 1611 or new_field in fields_in_header 
1612 ) 1613 and new_field not in [".*"] 1614 ): 1615 fields_output.append(new_field) 1616 1617 return fields_output 1618 1619 else: 1620 1621 return [] 1622 1623 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1624 """ 1625 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1626 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1627 not provided. 1628 1629 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1630 prefix to be used for exploding or expanding information 1631 :type explode_infos_prefix: str 1632 :return: the value of the variable `explode_infos_prefix`. 1633 """ 1634 1635 if not explode_infos_prefix: 1636 explode_infos_prefix = ( 1637 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1638 ) 1639 1640 return explode_infos_prefix 1641 1642 def add_column( 1643 self, 1644 table_name, 1645 column_name, 1646 column_type, 1647 default_value=None, 1648 drop: bool = False, 1649 ) -> dict: 1650 """ 1651 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1652 doesn't already exist. 1653 1654 :param table_name: The name of the table to which you want to add a column 1655 :param column_name: The parameter "column_name" is the name of the column that you want to add 1656 to the table 1657 :param column_type: The `column_type` parameter specifies the data type of the column that you 1658 want to add to the table. It should be a string that represents the desired data type, such as 1659 "INTEGER", "TEXT", "REAL", etc 1660 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1661 default value for the newly added column. 
If a default value is provided, it will be assigned to 1662 the column for any existing rows that do not have a value for that column 1663 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1664 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1665 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1666 to False 1667 :type drop: bool (optional) 1668 :return: a boolean value indicating whether the column was successfully added to the table. 1669 """ 1670 1671 # added 1672 added = False 1673 dropped = False 1674 1675 # Check if the column already exists in the table 1676 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1677 columns = self.get_query_to_df(query).columns.tolist() 1678 if column_name.upper() in [c.upper() for c in columns]: 1679 log.debug( 1680 f"The {column_name} column already exists in the {table_name} table" 1681 ) 1682 if drop: 1683 self.drop_column(table_name=table_name, column_name=column_name) 1684 dropped = True 1685 else: 1686 return None 1687 else: 1688 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1689 1690 # Add column in table 1691 add_column_query = ( 1692 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1693 ) 1694 if default_value is not None: 1695 add_column_query += f" DEFAULT {default_value}" 1696 self.execute_query(add_column_query) 1697 added = not dropped 1698 log.debug( 1699 f"The {column_name} column was successfully added to the {table_name} table" 1700 ) 1701 1702 if added: 1703 added_column = { 1704 "table_name": table_name, 1705 "column_name": column_name, 1706 "column_type": column_type, 1707 "default_value": default_value, 1708 } 1709 else: 1710 added_column = None 1711 1712 return added_column 1713 1714 def drop_column( 1715 self, column: dict = None, table_name: str = None, column_name: str = None 1716 ) -> bool: 1717 """ 1718 The 
`drop_column` function drops a specified column from a given table in a database and returns 1719 True if the column was successfully dropped, and False if the column does not exist in the 1720 table. 1721 1722 :param column: The `column` parameter is a dictionary that contains information about the column 1723 you want to drop. It has two keys: 1724 :type column: dict 1725 :param table_name: The `table_name` parameter is the name of the table from which you want to 1726 drop a column 1727 :type table_name: str 1728 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1729 from the table 1730 :type column_name: str 1731 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1732 and False if the column does not exist in the table. 1733 """ 1734 1735 # Find column infos 1736 if column: 1737 if isinstance(column, dict): 1738 table_name = column.get("table_name", None) 1739 column_name = column.get("column_name", None) 1740 elif isinstance(column, str): 1741 table_name = self.get_table_variants() 1742 column_name = column 1743 else: 1744 table_name = None 1745 column_name = None 1746 1747 if not table_name and not column_name: 1748 return False 1749 1750 # Removed 1751 removed = False 1752 1753 # Check if the column already exists in the table 1754 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1755 columns = self.get_query_to_df(query).columns.tolist() 1756 if column_name in columns: 1757 log.debug(f"The {column_name} column exists in the {table_name} table") 1758 else: 1759 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1760 return False 1761 1762 # Add column in table # ALTER TABLE integers DROP k 1763 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1764 self.execute_query(add_column_query) 1765 removed = True 1766 log.debug( 1767 f"The {column_name} column was successfully dropped to the {table_name} table" 1768 ) 1769 
1770 return removed 1771 1772 def explode_infos( 1773 self, 1774 prefix: str = None, 1775 create_index: bool = False, 1776 fields: list = None, 1777 force: bool = False, 1778 proccess_all_fields_together: bool = False, 1779 table: str = None, 1780 ) -> list: 1781 """ 1782 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1783 individual columns, returning a list of added columns. 1784 1785 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1786 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1787 `self.get_explode_infos_prefix()` as the prefix 1788 :type prefix: str 1789 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1790 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1791 `False`, indexes will not be created. The default value is `False`, defaults to False 1792 :type create_index: bool (optional) 1793 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1794 that you want to explode into individual columns. If this parameter is not provided, all INFO 1795 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1796 a list to the ` 1797 :type fields: list 1798 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1799 determines whether to drop and recreate a column if it already exists in the table. If `force` 1800 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1801 defaults to False 1802 :type force: bool (optional) 1803 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1804 flag that determines whether to process all the INFO fields together or individually. If set to 1805 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will be processed individually.
            The default value is, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: the name of the table where the exploded INFO fields will
            be added as individual columns; defaults to the variants table
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # drop indexes
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access (no columns are added in read-only mode)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: absence is treated as "none")
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" or regex-like names)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only process fields known from the header, the prefix form,
                # or the extra infos
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/cardinality from header; unknown fields are
                    # treated as String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # NOTE(review): add_column returns None after a forced
                    # drop-and-recreate, hence the 'or force' here
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: SQL expression extracting the
                        # 'info=' value from the raw INFO column
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            # sqlite has no REGEXP_EXTRACT: emulate with
                            # instr/substr arithmetic
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (fall back to a single pass when the query fails)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when updating per chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # One UPDATE setting every exploded field at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
def create_indexes(self) -> None:
    """
    Create the standard indexes on the variants table (after data insertion).

    Indexes are only created when indexing is enabled and the database is not
    opened in read-only ("RO") access mode: a composite index on the variant
    key ("#CHROM", "POS", "REF", "ALT"), one index per key column, and one
    index per additional field registered in `self.index_additionnal_fields`.
    """

    # Access mode ("RO" = read-only, no DDL allowed)
    access = self.get_config().get("access", None)

    # Table to index (FROM clause form)
    table_variants = self.get_table_variants("FROM")

    if self.get_indexing() and access not in ["RO"]:

        # Base name used to build index identifiers
        table_name = self.get_table_variants()

        # (index name suffix, quoted column list) — replaces five
        # copy-pasted CREATE INDEX statements
        indexes = [
            ("", '"#CHROM", "POS", "REF", "ALT"'),
            ("_chrom", '"#CHROM"'),
            ("_pos", '"POS"'),
            ("_ref", '"REF"'),
            ("_alt", '"ALT"'),
        ]
        for suffix, columns in indexes:
            self.conn.execute(
                f'CREATE INDEX IF NOT EXISTS idx_{table_name}{suffix} ON {table_variants} ({columns})'
            )

        # Indexes for additional (exploded INFO) fields; names are quoted
        # because they may contain special characters (e.g. "INFO/DP")
        for field in self.index_additionnal_fields:
            self.conn.execute(
                f""" CREATE INDEX IF NOT EXISTS "idx_{table_name}_{field}" ON {table_variants} ("{field}") """
            )
def drop_indexes(self) -> None:
    """
    Drop all existing indexes on the variants table (e.g. before bulk updates).

    No-op when the database is opened in read-only ("RO") access mode, or when
    the connexion format has no known index catalog.

    Note: the original docstring was copy-pasted from `create_indexes`; this
    method drops indexes, it does not create them.
    """

    # Access mode
    access = self.get_config().get("access", None)

    # get table variants
    table_variants = self.get_table_variants("FROM")

    # Get database format
    connexion_format = self.get_connexion_format()

    if access not in ["RO"]:

        # Index catalog query depends on the backend
        if connexion_format in ["duckdb"]:
            sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
        elif connexion_format in ["sqlite"]:
            sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
        else:
            # Unknown backend: nothing to drop (previously raised NameError
            # because sql_list_indexes was unbound)
            return

        list_indexes = self.conn.execute(sql_list_indexes)
        index_names = [row[0] for row in list_indexes.fetchall()]
        for index in index_names:
            sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
            self.conn.execute(sql_drop_table_index)

def read_vcf_header(self, f) -> list:
    """
    Read the header of an open VCF stream.

    :param f: an iterable of lines (open file object)
    :return: all lines up to and including the "#CHROM" column line
    """

    header_list = []
    for line in f:
        header_list.append(line)
        if line.startswith("#CHROM"):
            break
    return header_list

def read_vcf_header_file(self, file: str = None) -> list:
    """
    Read the header of a VCF file, transparently handling BGZF compression.

    :param file: path to the VCF (or VCF header) file
    :return: the header lines (see `read_vcf_header`)
    """

    if self.get_input_compressed(input_file=file):
        with bgzf.open(file, "rt") as f:
            return self.read_vcf_header(f=f)
    else:
        with open(file, "rt") as f:
            return self.read_vcf_header(f=f)

def execute_query(self, query: str):
    """
    Execute a SQL query on the current connexion.

    :param query: the SQL query to execute; falsy values are ignored
    :return: the backend cursor/result, or None if no query was given
    """
    if query:
        return self.conn.execute(query)  # .fetchall()
    else:
        return None
2082 """ 2083 if query: 2084 return self.conn.execute(query) # .fetchall() 2085 else: 2086 return None 2087 2088 def export_output( 2089 self, 2090 output_file: str | None = None, 2091 output_header: str | None = None, 2092 export_header: bool = True, 2093 query: str | None = None, 2094 parquet_partitions: list | None = None, 2095 chunk_size: int | None = None, 2096 threads: int | None = None, 2097 sort: bool = False, 2098 index: bool = False, 2099 order_by: str | None = None, 2100 fields_to_rename: dict | None = None, 2101 ) -> bool: 2102 """ 2103 The `export_output` function exports data from a VCF file to various formats, including VCF, 2104 CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and 2105 partitioning. 2106 2107 :param output_file: The `output_file` parameter is a string that specifies the name of the 2108 output file where the exported data will be saved 2109 :type output_file: str | None 2110 :param output_header: The `output_header` parameter is a string that specifies the name of the 2111 file where the header of the VCF file will be exported. If this parameter is not provided, the 2112 header will be exported to a file with the same name as the `output_file` parameter, but with 2113 the extension " 2114 :type output_header: str | None 2115 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2116 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2117 True, the header will be exported to a file. If `export_header` is False, the header will not 2118 be, defaults to True 2119 :type export_header: bool (optional) 2120 :param query: The `query` parameter in the `export_output` function is an optional SQL query 2121 that can be used to filter and select specific data from the VCF file before exporting it. If 2122 provided, only the data that matches the query will be exported. 
This allows you to customize 2123 the exported data based on 2124 :type query: str | None 2125 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2126 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2127 organize data in a hierarchical directory structure based on the values of one or more columns. 2128 This can improve query performance when working with large datasets 2129 :type parquet_partitions: list | None 2130 :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when 2131 exporting data in Parquet format. This parameter is used for partitioning the Parquet file into 2132 multiple files. It helps in optimizing the export process by breaking down the data into 2133 manageable chunks for processing and storage 2134 :type chunk_size: int | None 2135 :param threads: The `threads` parameter in the `export_output` function specifies the number of 2136 threads to be used during the export process. It determines the level of parallelism and can 2137 improve the performance of the export operation. If this parameter is not provided, the function 2138 will use the default number of threads 2139 :type threads: int | None 2140 :param sort: The `sort` parameter in the `export_output` function is a boolean flag that 2141 determines whether the output file should be sorted based on genomic coordinates of the 2142 variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to 2143 `False`,, defaults to False 2144 :type sort: bool (optional) 2145 :param index: The `index` parameter in the `export_output` function is a boolean flag that 2146 determines whether an index should be created on the output file. If `index` is set to `True`, 2147 an index will be created on the output file. 
If `index` is set to `False`, no, defaults to False 2148 :type index: bool (optional) 2149 :param order_by: The `order_by` parameter in the `export_output` function is a string that 2150 specifies the column(s) to use for sorting the output file. This parameter is only applicable 2151 when exporting data in VCF format. It allows you to specify the column(s) based on which the 2152 output file should be 2153 :type order_by: str | None 2154 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the 2155 mapping of field names to be renamed during the export process. This parameter allows you to 2156 customize the output field names before exporting the data. Each key-value pair in the 2157 dictionary represents the original field name as the key and the new field name 2158 :type fields_to_rename: dict | None 2159 :return: The `export_output` function returns a boolean value. It checks if the output file 2160 exists and returns True if it does, or None if it doesn't. 
2161 """ 2162 2163 # Log 2164 log.info("Exporting...") 2165 2166 # Full path 2167 output_file = full_path(output_file) 2168 output_header = full_path(output_header) 2169 2170 # Config 2171 config = self.get_config() 2172 2173 # Param 2174 param = self.get_param() 2175 2176 # Tmp files to remove 2177 tmp_to_remove = [] 2178 2179 # If no output, get it 2180 if not output_file: 2181 output_file = self.get_output() 2182 2183 # If not threads 2184 if not threads: 2185 threads = self.get_threads() 2186 2187 # Rename fields 2188 if not fields_to_rename: 2189 fields_to_rename = param.get("export", {}).get("fields_to_rename", None) 2190 self.rename_info_fields(fields_to_rename=fields_to_rename) 2191 2192 # Auto header name with extension 2193 if export_header or output_header: 2194 if not output_header: 2195 output_header = f"{output_file}.hdr" 2196 # Export header 2197 self.export_header(output_file=output_file) 2198 2199 # Switch off export header if VCF output 2200 output_file_type = get_file_format(output_file) 2201 if output_file_type in ["vcf"]: 2202 export_header = False 2203 tmp_to_remove.append(output_header) 2204 2205 # Chunk size 2206 if not chunk_size: 2207 chunk_size = config.get("chunk_size", None) 2208 2209 # Parquet partition 2210 if not parquet_partitions: 2211 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2212 if parquet_partitions and isinstance(parquet_partitions, str): 2213 parquet_partitions = parquet_partitions.split(",") 2214 2215 # Order by 2216 if not order_by: 2217 order_by = param.get("export", {}).get("order_by", "") 2218 2219 # Header in output 2220 header_in_output = param.get("export", {}).get("include_header", False) 2221 2222 # Database 2223 database_source = self.get_connexion() 2224 2225 # Connexion format 2226 connexion_format = self.get_connexion_format() 2227 2228 # Explode infos 2229 if self.get_explode_infos(): 2230 self.explode_infos( 2231 prefix=self.get_explode_infos_prefix(), 2232 
fields=self.get_explode_infos_fields(), 2233 force=False, 2234 ) 2235 2236 # if connexion_format in ["sqlite"] or query: 2237 if connexion_format in ["sqlite"]: 2238 2239 # Export in Parquet 2240 random_tmp = "".join( 2241 random.choice(string.ascii_lowercase) for i in range(10) 2242 ) 2243 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2244 tmp_to_remove.append(database_source) 2245 2246 # Table Variants 2247 table_variants = self.get_table_variants() 2248 2249 # Create export query 2250 sql_query_export_subquery = f""" 2251 SELECT * FROM {table_variants} 2252 """ 2253 2254 # Write source file 2255 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2256 2257 # Create database 2258 database = Database( 2259 database=database_source, 2260 table="variants", 2261 header_file=output_header, 2262 conn_config=self.get_connexion_config(), 2263 ) 2264 2265 # Existing colomns header 2266 existing_columns_header = database.get_header_columns_from_database(query=query) 2267 2268 # Sample list 2269 if output_file_type in ["vcf"]: 2270 get_samples = self.get_samples() 2271 get_samples_check = self.get_samples_check() 2272 samples_force = get_samples is not None 2273 sample_list = self.get_header_sample_list( 2274 check=get_samples_check, 2275 samples=get_samples, 2276 samples_force=samples_force, 2277 ) 2278 else: 2279 sample_list = None 2280 2281 # Export file 2282 database.export( 2283 output_database=output_file, 2284 output_header=output_header, 2285 existing_columns_header=existing_columns_header, 2286 parquet_partitions=parquet_partitions, 2287 chunk_size=chunk_size, 2288 threads=threads, 2289 sort=sort, 2290 index=index, 2291 header_in_output=header_in_output, 2292 order_by=order_by, 2293 query=query, 2294 export_header=export_header, 2295 sample_list=sample_list, 2296 ) 2297 2298 # Remove 2299 remove_if_exists(tmp_to_remove) 2300 2301 return (os.path.exists(output_file) or None) and ( 2302 os.path.exists(output_file) 
def get_extra_infos(self, table: str = None) -> list:
    """
    Return the columns present in a table but absent from the VCF header.

    :param table: table to inspect; when omitted, the variants table is used
        and the declared header columns are excluded
    :return: list of column names not declared in the header
    """

    known_columns = []

    # Default to the variants table, excluding its declared header columns
    if not table:
        table = self.get_table_variants(clause="from")
        known_columns = self.get_header_columns()

    # Probe the table structure with a single-row select
    query = f""" SELECT * FROM {table} LIMIT 1 """
    log.debug(f"query {query}")
    found_columns = self.get_query_to_df(query).columns.tolist()

    # Keep only columns the header does not declare
    return [col for col in found_columns if col not in known_columns]
def get_extra_infos_sql(self, table: str = None) -> str:
    """
    Return the extra (non-header) columns as a comma-separated, double-quoted
    SQL column list.

    :param table: table to inspect; defaults to the variants table
    :return: e.g. '"COL1", "COL2"' (empty string if no extra columns)
    """

    return ", ".join(
        ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
    )

def export_header(
    self,
    header_name: str = None,
    output_file: str = None,
    output_file_ext: str = ".hdr",
    clean_header: bool = True,
    remove_chrom_line: bool = False,
) -> str:
    """
    Write the VCF header to a side file "<output_file><output_file_ext>".

    The final #CHROM line is replaced by the real columns found in the input
    database; malformed FORMAT/Flag lines can be rewritten; the #CHROM line
    can be dropped entirely.

    :param header_name: NOTE(review): only consulted to decide whether the
        default output file applies; the header is always written next to
        `output_file` — confirm intent
    :param output_file: base name of the output; defaults to `self.get_output()`
    :param output_file_ext: extension appended to `output_file`, defaults to ".hdr"
    :param clean_header: rewrite "##FORMAT=<...Type=Flag" header lines as
        Type=String (at most 2 substitutions, as before), defaults to True
    :param remove_chrom_line: drop the final #CHROM line, defaults to False
    :return: the name of the header file written, or None if no header is set
    """

    if not header_name and not output_file:
        output_file = self.get_output()

    if self.get_header():

        # Get header object
        header_obj = self.get_header()

        # Database object on the input file, to fetch the real columns
        db_for_header = Database(database=self.get_input())
        db_header_columns = db_for_header.get_columns()

        with tempfile.TemporaryDirectory() as tmpdir:

            # Serialize the header through the VCF writer
            # (with-block replaces the unprotected open/close pair)
            header_file_tmp = os.path.join(tmpdir, "header")
            with open(header_file_tmp, "w") as f:
                vcf.Writer(f, header_obj)

            # Replace #CHROM line with the real columns
            header_list = db_for_header.read_header_file(
                header_file=header_file_tmp
            )
            header_list[-1] = "\t".join(db_header_columns)

            # Remove CHROM line
            if remove_chrom_line:
                header_list.pop()

            # Clean malformed header lines
            if clean_header:
                header_list_clean = []
                for head in header_list:
                    head_clean = re.subn(
                        "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                        r"##FORMAT=<ID=\1,Number=\2,Type=String",
                        head,
                        2,
                    )[0]
                    header_list_clean.append(head_clean)
                header_list = header_list_clean

            # Write the header next to the output file
            tmp_header_name = output_file + output_file_ext
            with open(tmp_header_name, "w") as f:
                for line in header_list:
                    f.write(line)

        return tmp_header_name
2443 f.close() 2444 2445 return tmp_header_name 2446 2447 def export_variant_vcf( 2448 self, 2449 vcf_file, 2450 remove_info: bool = False, 2451 add_samples: bool = True, 2452 list_samples: list = [], 2453 where_clause: str = "", 2454 index: bool = False, 2455 threads: int | None = None, 2456 ) -> bool | None: 2457 """ 2458 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2459 remove INFO field, add samples, and control compression and indexing. 2460 2461 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2462 written to. It is the output file that will contain the filtered VCF data based on the specified 2463 parameters 2464 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2465 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2466 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2467 in, defaults to False 2468 :type remove_info: bool (optional) 2469 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2470 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2471 If set to False, the samples will be removed. The default value is True, defaults to True 2472 :type add_samples: bool (optional) 2473 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2474 in the output VCF file. By default, all samples will be included. If you provide a list of 2475 samples, only those samples will be included in the output file 2476 :type list_samples: list 2477 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2478 determines whether or not to create an index for the output VCF file. If `index` is set to 2479 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2480 :type index: bool (optional) 2481 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2482 number of threads to use for exporting the VCF file. It determines how many parallel threads 2483 will be used during the export process. More threads can potentially speed up the export process 2484 by utilizing multiple cores of the processor. If 2485 :type threads: int | None 2486 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2487 method with various parameters including the output file, query, threads, sort flag, and index 2488 flag. The `export_output` method is responsible for exporting the VCF data based on the 2489 specified parameters and configurations provided in the `export_variant_vcf` function. 2490 """ 2491 2492 # Config 2493 config = self.get_config() 2494 2495 # Extract VCF 2496 log.debug("Export VCF...") 2497 2498 # Table variants 2499 table_variants = self.get_table_variants() 2500 2501 # Threads 2502 if not threads: 2503 threads = self.get_threads() 2504 2505 # Info fields 2506 if remove_info: 2507 if not isinstance(remove_info, str): 2508 remove_info = "." 
def run_commands(self, commands: list = None, threads: int = 1) -> None:
    """
    Run a list of shell commands in parallel.

    :param commands: commands to run; defaults to no commands
        (default changed from a shared mutable [] to None — same behavior)
    :param threads: number of parallel workers, defaults to 1
    """

    if commands is None:
        commands = []

    run_parallel_commands(commands, threads)

def get_threads(self, default: int = 1) -> int:
    """
    Resolve the number of threads to use.

    Priority: param "threads", then config "threads", then `default`.
    A falsy value (missing, None, 0) falls back to `default`; a negative
    value means "use all available cores" (os.cpu_count()).

    :param default: fallback when no thread count is configured, defaults to 1
    :return: the number of threads to use
    """

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Requested thread count (param overrides config)
    input_thread = param.get("threads", config.get("threads", None))

    # Resolve
    if not input_thread:
        threads = default
    elif int(input_thread) <= 0:
        threads = os.cpu_count()
    else:
        threads = int(input_thread)
    return threads
def get_memory(self, default: str = None) -> str:
    """
    Resolve the memory setting, with param taking precedence over config.

    :param default: fallback value when neither param nor config define
        "memory"
    :return: the configured memory value, or `default`
    """

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Param "memory" overrides config "memory"
    requested = param.get("memory", config.get("memory", None))

    # Fall back to the default for any falsy value
    return requested if requested else default
2603 """ 2604 2605 # Config 2606 config = self.get_config() 2607 2608 # Param 2609 param = self.get_param() 2610 2611 # Input threads 2612 input_memory = param.get("memory", config.get("memory", None)) 2613 2614 # Check threads 2615 if input_memory: 2616 memory = input_memory 2617 else: 2618 memory = default 2619 2620 return memory 2621 2622 def update_from_vcf(self, vcf_file: str) -> None: 2623 """ 2624 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2625 2626 :param vcf_file: the path to the VCF file 2627 """ 2628 2629 connexion_format = self.get_connexion_format() 2630 2631 if connexion_format in ["duckdb"]: 2632 self.update_from_vcf_duckdb(vcf_file) 2633 elif connexion_format in ["sqlite"]: 2634 self.update_from_vcf_sqlite(vcf_file) 2635 2636 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2637 """ 2638 It takes a VCF file and updates the INFO column of the variants table in the database with the 2639 INFO column of the VCF file 2640 2641 :param vcf_file: the path to the VCF file 2642 """ 2643 2644 # varaints table 2645 table_variants = self.get_table_variants() 2646 2647 # Loading VCF into temporaire table 2648 skip = self.get_header_length(file=vcf_file) 2649 vcf_df = pd.read_csv( 2650 vcf_file, 2651 sep="\t", 2652 engine="c", 2653 skiprows=skip, 2654 header=0, 2655 low_memory=False, 2656 ) 2657 sql_query_update = f""" 2658 UPDATE {table_variants} as table_variants 2659 SET INFO = concat( 2660 CASE 2661 WHEN INFO NOT IN ('', '.') 2662 THEN INFO 2663 ELSE '' 2664 END, 2665 ( 2666 SELECT 2667 concat( 2668 CASE 2669 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2670 THEN ';' 2671 ELSE '' 2672 END 2673 , 2674 CASE 2675 WHEN table_parquet.INFO NOT IN ('','.') 2676 THEN table_parquet.INFO 2677 ELSE '' 2678 END 2679 ) 2680 FROM vcf_df as table_parquet 2681 WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2682 AND table_parquet.\"POS\" = 
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    """
    Update the INFO column of the variants table with the INFO column of the
    given VCF file (DuckDB backend). Existing INFO content is kept and the
    VCF's INFO is appended with a ';' separator when both sides are non-empty.

    :param vcf_file: the path to the VCF file
    """

    # variants table
    table_variants = self.get_table_variants()

    # Load the VCF body into a DataFrame, skipping the header lines
    skip = self.get_header_length(file=vcf_file)
    vcf_df = pd.read_csv(
        vcf_file,
        sep="\t",
        engine="c",
        skiprows=skip,
        header=0,
        low_memory=False,
    )
    # NOTE: vcf_df looks unused, but it is referenced by name in the SQL
    # below ("FROM vcf_df"): DuckDB resolves in-scope pandas DataFrames as
    # tables (replacement scan) — do not remove it.
    sql_query_update = f"""
    UPDATE {table_variants} as table_variants
    SET INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    (
                    SELECT
                        concat(
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END
                            ,
                            CASE
                                WHEN table_parquet.INFO NOT IN ('','.')
                                THEN table_parquet.INFO
                                ELSE ''
                            END
                        )
                    FROM vcf_df as table_parquet
                    WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                        AND table_parquet.INFO NOT IN ('','.')
                    )
                )
    ;
    """
    self.conn.execute(sql_query_update)

def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    """
    Update the INFO column of the variants table with the INFO column of the
    given VCF file (SQLite backend): loads the VCF into a temporary table,
    appends its INFO to the variants' INFO, then drops the temporary table.

    :param vcf_file: The path to the VCF file you want to update the database with
    """

    # Create a temporary table with the same schema as 'variants' (WHERE 0
    # copies the structure without any rows)
    table_vcf = "tmp_vcf"
    sql_create = (
        f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
    )
    self.conn.execute(sql_create)

    # Load the VCF into the temporary table (header lines start with '#')
    vcf_df = pd.read_csv(
        vcf_file, sep="\t", comment="#", header=None, low_memory=False
    )
    vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

    # Update table 'variants' with VCF data
    # warning: CONCAT as || operator (SQLite has no concat() function)
    sql_query_update = f"""
    UPDATE variants as table_variants
    SET INFO = CASE
                    WHEN INFO NOT IN ('', '.')
                    THEN INFO
                    ELSE ''
                END ||
                (
                SELECT
                    CASE
                        WHEN table_variants.INFO NOT IN ('','.')
                            AND table_vcf.INFO NOT IN ('','.')
                        THEN ';'
                        ELSE ''
                    END ||
                    CASE
                        WHEN table_vcf.INFO NOT IN ('','.')
                        THEN table_vcf.INFO
                        ELSE ''
                    END
                FROM {table_vcf} as table_vcf
                WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                    AND table_vcf.\"POS\" = table_variants.\"POS\"
                    AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                    AND table_vcf.\"REF\" = table_variants.\"REF\"
                )
    """
    self.conn.execute(sql_query_update)

    # Drop temporary table
    sql_drop = f"DROP TABLE {table_vcf}"
    self.conn.execute(sql_drop)
def drop_variants_table(self) -> None:
    """
    Drop the variants table if it exists.
    """

    table_variants = self.get_table_variants()
    sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
    self.conn.execute(sql_table_variants)

def set_variant_id(
    self, variant_id_column: str = "variant_id", force: bool = None
) -> str:
    """
    Add a variant identifier column to the variants table, populated with a
    hash of assembly, "#CHROM", "POS", "REF", "ALT" and an SVTYPE tag.

    :param variant_id_column: name of the column to create, defaults to
        "variant_id"
    :param force: if True, (re)populate the column even if it already exists
    :return: the name of the variant_id column
    """

    # Assembly (param overrides config, then DEFAULT_ASSEMBLY)
    assembly = self.get_param().get(
        "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
    )

    # INFO/Tag prefix
    prefix = self.get_explode_infos_prefix()

    # Explode INFO/SVTYPE
    added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

    # variants table
    table_variants = self.get_table_variants()

    # variant_id column
    if not variant_id_column:
        variant_id_column = "variant_id"

    # Create variant_id column if missing.
    # Fixed: the check used the hard-coded "variant_id" instead of the
    # requested column name, so custom column names were mishandled.
    if variant_id_column not in self.get_extra_infos() or force:

        # Create column
        self.add_column(
            table_name=table_variants,
            column_name=variant_id_column,
            column_type="UBIGINT",
            default_value="0",
        )

        # Update column
        # NOTE(review): '"{prefix}SVTYPE"' is hashed as a literal string,
        # not as the exploded SVTYPE column value — confirm intent
        self.conn.execute(
            f"""
            UPDATE {table_variants}
            SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
            """
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

    # return variant_id column name
    return variant_id_column
added_column in added_columns: 2813 self.drop_column(column=added_column) 2814 2815 # return variant_id column name 2816 return variant_id_column 2817 2818 def get_variant_id_column( 2819 self, variant_id_column: str = "variant_id", force: bool = None 2820 ) -> str: 2821 """ 2822 This function returns the variant_id column name 2823 2824 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2825 defaults to variant_id 2826 :type variant_id_column: str (optional) 2827 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2828 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2829 if it is not already set, or if it is set 2830 :type force: bool 2831 :return: The variant_id column name. 2832 """ 2833 2834 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2835 2836 ### 2837 # Annotation 2838 ### 2839 2840 def scan_databases( 2841 self, 2842 database_formats: list = ["parquet"], 2843 database_releases: list = ["current"], 2844 ) -> dict: 2845 """ 2846 The function `scan_databases` scans for available databases based on specified formats and 2847 releases. 2848 2849 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2850 of the databases to be scanned. In this case, the accepted format is "parquet" 2851 :type database_formats: list ["parquet"] 2852 :param database_releases: The `database_releases` parameter is a list that specifies the 2853 releases of the databases to be scanned. In the provided function, the default value for 2854 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2855 databases that are in the "current" 2856 :type database_releases: list 2857 :return: The function `scan_databases` returns a dictionary containing information about 2858 databases that match the specified formats and releases. 
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        Quick annotation strings (param 'annotations' and per-tool shortcut params such
        as 'annotation_parquet', 'annotation_snpsift', 'annotation_bcftools', ...) are
        merged and parsed, database files are resolved on disk (or discovered via the
        'ALL' keyword through scan_databases), each database is dispatched to an
        annotation tool, and the tool-specific annotation_*() methods are called.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (searched later when resolving database files)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated quick-annotation string)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold per-tool shortcut params into the quick-annotation list
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each database
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection (explicit 'tool:' prefix wins)
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("bigwig:"):
                            annotation_tool_initial = "bigwig"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        # NOTE(review): this inner loop rebinds 'annotation_file',
                        # shadowing the outer loop variable — intentional here since
                        # the outer value is no longer needed, but fragile
                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file
                                elif os.path.exists(full_path(annotation_file)):
                                    annotation_file_found = full_path(annotation_file)
                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",  # NOTE(review): duplicate "tsv" entry — harmless but redundant
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        elif quick_annotation_format in ["bw"]:
                                            annotation_tool = "bigwig"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.warning(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        self.set_param(param)

        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def annotation_bigwig(self, threads: int = None) -> None:
        """
        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.

        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
        number of threads to be used for parallel processing during the annotation process.
        If the
        `threads` parameter is not provided, the method will attempt to determine the optimal number of
        threads to use based on the system configuration
        :type threads: int
        :return: True
        """

        # DEBUG
        log.debug("Start annotation with bigwig databases")

        # # Threads
        # if not threads:
        #     threads = self.get_threads()
        # log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - bigwig databases folders ('annotations' + 'bigwig' entries)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bigwig", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bigwig", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (logged for debugging only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # annotation_bigwig_config: one entry per usable bigwig database
                annotation_bigwig_config_list = []

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()

                    # If db_file is http ?
                    if database.get_database().startswith("http"):

                        # Database is HTTP URL
                        db_file_is_http = True

                        # DB file keep as URL
                        db_file = database.get_database()
                        log.warning(
                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
                        )

                        # Retrieve automatic annotation field name
                        annotation_field = clean_annotation_field(
                            os.path.basename(db_file).replace(".bw", "")
                        )
                        log.debug(
                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
                        )

                        # Create automatic header file
                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                        with open(db_hdr_file, "w") as f:
                            f.write("##fileformat=VCFv4.2\n")
                            f.write(
                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
                            )
                            f.write(f"#CHROM START END {annotation_field}\n")

                    else:

                        # Database is NOT HTTP URL
                        db_file_is_http = False

                    # Check index - try to create if not exists
                    if (
                        db_file is None
                        or db_hdr_file is None
                        or (not os.path.exists(db_file) and not db_file_is_http)
                        or not os.path.exists(db_hdr_file)
                        or not db_file_type in ["bw"]
                    ):
                        # if False:
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation file type: {db_file_type}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                        )
                    else:

                        # Log
                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Init
                        cyvcf2_header_rename_dict = {}
                        cyvcf2_header_list = []
                        cyvcf2_header_indexes = {}

                        # process annotation fields
                        for annotation_field in annotation_fields:

                            # New annotation name
                            annotation_field_new = annotation_fields[annotation_field]

                            # Check annotation field and index in header
                            # (- 3 skips the #CHROM/START/END columns of the header file)
                            if (
                                annotation_field
                                in db_hdr_vcf.get_header_columns_as_list()
                            ):
                                annotation_field_index = (
                                    db_hdr_vcf.get_header_columns_as_list().index(
                                        annotation_field
                                    )
                                    - 3
                                )
                                cyvcf2_header_indexes[annotation_field_new] = (
                                    annotation_field_index
                                )
                            else:
                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                            # Append annotation field in cyvcf2 header list
                            cyvcf2_header_rename_dict[annotation_field_new] = (
                                db_hdr_vcf_header_infos[annotation_field].id
                            )
                            cyvcf2_header_list.append(
                                {
                                    "ID": annotation_field_new,
                                    "Number": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].num,
                                    "Type": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].type,
                                    "Description": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].desc,
                                }
                            )

                            # Add header on VCF
                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
                                annotation_field_new,
                                db_hdr_vcf_header_infos[annotation_field].num,
                                db_hdr_vcf_header_infos[annotation_field].type,
                                db_hdr_vcf_header_infos[annotation_field].desc,
                                "HOWARD BigWig annotation",
                                "unknown",
                                self.code_type_map[
                                    db_hdr_vcf_header_infos[annotation_field].type
                                ],
                            )

                        # Load bigwig database
                        bw_db = pyBigWig.open(db_file)
                        if bw_db.isBigWig():
                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
                        else:
                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        annotation_bigwig_config_list.append(
                            {
                                "db_file": db_file,
                                "bw_db": bw_db,
                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                                "cyvcf2_header_list": cyvcf2_header_list,
                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
                            }
                        )

                # Annotate
                if annotation_bigwig_config_list:

                    # Annotation config
                    log.debug(
                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
                    )

                    # Export VCF file
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Load input tmp file
                    input_vcf = cyvcf2.VCF(tmp_vcf_name)

                    # Add header in input file
                    for annotation_bigwig_config in annotation_bigwig_config_list:
                        for cyvcf2_header_field in annotation_bigwig_config.get(
                            "cyvcf2_header_list", []
                        ):
                            log.info(
                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
                            )
                            input_vcf.add_info_to_header(cyvcf2_header_field)

                    # Create output VCF file
                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                    # Fetch variants
                    log.info(f"Annotations 'bigwig' start...")
                    for variant in input_vcf:

                        for annotation_bigwig_config in annotation_bigwig_config_list:

                            # DB and indexes
                            bw_db = annotation_bigwig_config.get("bw_db", None)
                            cyvcf2_header_indexes = annotation_bigwig_config.get(
                                "cyvcf2_header_indexes", None
                            )

                            # Retrieve value from chrom pos
                            # (POS is 1-based in VCF; bigwig intervals are 0-based)
                            res = bw_db.values(
                                variant.CHROM, variant.POS - 1, variant.POS
                            )

                            # For each annotation fields (and indexes)
                            for cyvcf2_header_index in cyvcf2_header_indexes:

                                # Skip NaN (no bigwig value at this position)
                                if not np.isnan(
                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
                                ):
                                    variant.INFO[cyvcf2_header_index] = res[
                                        cyvcf2_header_indexes[cyvcf2_header_index]
                                    ]

                        # Add record in output file
                        output_vcf.write_record(variant)

                    # Log
                    log.debug(f"Annotation done.")

                    # Close and write file
                    log.info(f"Annotations 'bigwig' write...")
                    output_vcf.close()
                    log.debug(f"Write done.")

                    # Update variants
                    log.info(f"Annotations 'bigwig' update...")
                    self.update_from_vcf(output_vcf_file)
                    log.debug(f"Update done.")

        return True

    def annotation_snpsift(self, threads: int = None) -> None:
        """
        This function annotates with SnpSift (SnpSift.jar piped into bcftools annotate)

        :param threads: Number of threads to use
        :return: the value of the variable "return_value".
3627 """ 3628 3629 # DEBUG 3630 log.debug("Start annotation with bcftools databases") 3631 3632 # Threads 3633 if not threads: 3634 threads = self.get_threads() 3635 log.debug("Threads: " + str(threads)) 3636 3637 # Config 3638 config = self.get_config() 3639 log.debug("Config: " + str(config)) 3640 3641 # Config - snpSift 3642 snpsift_bin_command = get_bin_command( 3643 bin="SnpSift.jar", 3644 tool="snpsift", 3645 bin_type="jar", 3646 config=config, 3647 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3648 ) 3649 if not snpsift_bin_command: 3650 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3651 log.error(msg_err) 3652 raise ValueError(msg_err) 3653 3654 # Config - bcftools 3655 bcftools_bin_command = get_bin_command( 3656 bin="bcftools", 3657 tool="bcftools", 3658 bin_type="bin", 3659 config=config, 3660 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3661 ) 3662 if not bcftools_bin_command: 3663 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3664 log.error(msg_err) 3665 raise ValueError(msg_err) 3666 3667 # Config - BCFTools databases folders 3668 databases_folders = set( 3669 self.get_config() 3670 .get("folders", {}) 3671 .get("databases", {}) 3672 .get("annotations", ["."]) 3673 + self.get_config() 3674 .get("folders", {}) 3675 .get("databases", {}) 3676 .get("bcftools", ["."]) 3677 ) 3678 log.debug("Databases annotations: " + str(databases_folders)) 3679 3680 # Param 3681 annotations = ( 3682 self.get_param() 3683 .get("annotation", {}) 3684 .get("snpsift", {}) 3685 .get("annotations", None) 3686 ) 3687 log.debug("Annotations: " + str(annotations)) 3688 3689 # Assembly 3690 assembly = self.get_param().get( 3691 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3692 ) 3693 3694 # Data 3695 table_variants = self.get_table_variants() 3696 3697 # Check if not empty 3698 log.debug("Check if not empty") 3699 sql_query_chromosomes = ( 3700 f"""SELECT count(*) as count FROM {table_variants} as 
table_variants""" 3701 ) 3702 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3703 if not sql_query_chromosomes_df["count"][0]: 3704 log.info(f"VCF empty") 3705 return 3706 3707 # VCF header 3708 vcf_reader = self.get_header() 3709 log.debug("Initial header: " + str(vcf_reader.infos)) 3710 3711 # Existing annotations 3712 for vcf_annotation in self.get_header().infos: 3713 3714 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3715 log.debug( 3716 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3717 ) 3718 3719 if annotations: 3720 3721 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3722 3723 # Export VCF file 3724 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3725 3726 # Init 3727 commands = {} 3728 3729 for annotation in annotations: 3730 annotation_fields = annotations[annotation] 3731 3732 # Annotation Name 3733 annotation_name = os.path.basename(annotation) 3734 3735 if not annotation_fields: 3736 annotation_fields = {"INFO": None} 3737 3738 log.debug(f"Annotation '{annotation_name}'") 3739 log.debug( 3740 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3741 ) 3742 3743 # Create Database 3744 database = Database( 3745 database=annotation, 3746 databases_folders=databases_folders, 3747 assembly=assembly, 3748 ) 3749 3750 # Find files 3751 db_file = database.get_database() 3752 db_file = full_path(db_file) 3753 db_hdr_file = database.get_header_file() 3754 db_hdr_file = full_path(db_hdr_file) 3755 db_file_type = database.get_format() 3756 db_tbi_file = f"{db_file}.tbi" 3757 db_file_compressed = database.is_compressed() 3758 3759 # Check if compressed 3760 if not db_file_compressed: 3761 log.error( 3762 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3763 ) 3764 raise ValueError( 3765 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3766 ) 3767 3768 # Check if indexed 3769 if not os.path.exists(db_tbi_file): 3770 log.error( 3771 
f"Annotation '{annotation}' - {db_file} NOT indexed file" 3772 ) 3773 raise ValueError( 3774 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3775 ) 3776 3777 # Check index - try to create if not exists 3778 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3779 log.error("Annotation failed: database not valid") 3780 log.error(f"Annotation annotation file: {db_file}") 3781 log.error(f"Annotation annotation header: {db_hdr_file}") 3782 log.error(f"Annotation annotation index: {db_tbi_file}") 3783 raise ValueError( 3784 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3785 ) 3786 else: 3787 3788 log.debug( 3789 f"Annotation '{annotation}' - file: " 3790 + str(db_file) 3791 + " and " 3792 + str(db_hdr_file) 3793 ) 3794 3795 # Load header as VCF object 3796 db_hdr_vcf = Variants(input=db_hdr_file) 3797 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3798 log.debug( 3799 "Annotation database header: " 3800 + str(db_hdr_vcf_header_infos) 3801 ) 3802 3803 # For all fields in database 3804 annotation_fields_full = False 3805 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3806 annotation_fields = { 3807 key: key for key in db_hdr_vcf_header_infos 3808 } 3809 log.debug( 3810 "Annotation database header - All annotations added: " 3811 + str(annotation_fields) 3812 ) 3813 annotation_fields_full = True 3814 3815 # # Create file for field rename 3816 # log.debug("Create file for field rename") 3817 # tmp_rename = NamedTemporaryFile( 3818 # prefix=self.get_prefix(), 3819 # dir=self.get_tmp_dir(), 3820 # suffix=".rename", 3821 # delete=False, 3822 # ) 3823 # tmp_rename_name = tmp_rename.name 3824 # tmp_files.append(tmp_rename_name) 3825 3826 # Number of fields 3827 nb_annotation_field = 0 3828 annotation_list = [] 3829 annotation_infos_rename_list = [] 3830 3831 for annotation_field in 
annotation_fields: 3832 3833 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3834 annotation_fields_new_name = annotation_fields.get( 3835 annotation_field, annotation_field 3836 ) 3837 if not annotation_fields_new_name: 3838 annotation_fields_new_name = annotation_field 3839 3840 # Check if field is in DB and if field is not elready in input data 3841 if ( 3842 annotation_field in db_hdr_vcf.get_header().infos 3843 and annotation_fields_new_name 3844 not in self.get_header().infos 3845 ): 3846 3847 log.info( 3848 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3849 ) 3850 3851 # BCFTools annotate param to rename fields 3852 if annotation_field != annotation_fields_new_name: 3853 annotation_infos_rename_list.append( 3854 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3855 ) 3856 3857 # Add INFO field to header 3858 db_hdr_vcf_header_infos_number = ( 3859 db_hdr_vcf_header_infos[annotation_field].num or "." 3860 ) 3861 db_hdr_vcf_header_infos_type = ( 3862 db_hdr_vcf_header_infos[annotation_field].type 3863 or "String" 3864 ) 3865 db_hdr_vcf_header_infos_description = ( 3866 db_hdr_vcf_header_infos[annotation_field].desc 3867 or f"{annotation_field} description" 3868 ) 3869 db_hdr_vcf_header_infos_source = ( 3870 db_hdr_vcf_header_infos[annotation_field].source 3871 or "unknown" 3872 ) 3873 db_hdr_vcf_header_infos_version = ( 3874 db_hdr_vcf_header_infos[annotation_field].version 3875 or "unknown" 3876 ) 3877 3878 vcf_reader.infos[annotation_fields_new_name] = ( 3879 vcf.parser._Info( 3880 annotation_fields_new_name, 3881 db_hdr_vcf_header_infos_number, 3882 db_hdr_vcf_header_infos_type, 3883 db_hdr_vcf_header_infos_description, 3884 db_hdr_vcf_header_infos_source, 3885 db_hdr_vcf_header_infos_version, 3886 self.code_type_map[ 3887 db_hdr_vcf_header_infos_type 3888 ], 3889 ) 3890 ) 3891 3892 annotation_list.append(annotation_field) 3893 3894 nb_annotation_field += 1 3895 3896 else: 3897 
3898 if ( 3899 annotation_field 3900 not in db_hdr_vcf.get_header().infos 3901 ): 3902 log.warning( 3903 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3904 ) 3905 if ( 3906 annotation_fields_new_name 3907 in self.get_header().infos 3908 ): 3909 log.warning( 3910 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3911 ) 3912 3913 log.info( 3914 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3915 ) 3916 3917 annotation_infos = ",".join(annotation_list) 3918 3919 if annotation_infos != "": 3920 3921 # Annotated VCF (and error file) 3922 tmp_annotation_vcf_name = os.path.join( 3923 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3924 ) 3925 tmp_annotation_vcf_name_err = ( 3926 tmp_annotation_vcf_name + ".err" 3927 ) 3928 3929 # Add fields to annotate 3930 if not annotation_fields_full: 3931 annotation_infos_option = f"-info {annotation_infos}" 3932 else: 3933 annotation_infos_option = "" 3934 3935 # Info fields rename 3936 if annotation_infos_rename_list: 3937 annotation_infos_rename = " -c " + ",".join( 3938 annotation_infos_rename_list 3939 ) 3940 else: 3941 annotation_infos_rename = "" 3942 3943 # Annotate command 3944 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3945 3946 # Add command 3947 commands[command_annotate] = tmp_annotation_vcf_name 3948 3949 if commands: 3950 3951 # Export VCF file 3952 self.export_variant_vcf( 3953 vcf_file=tmp_vcf_name, 3954 remove_info=True, 3955 add_samples=False, 3956 index=True, 3957 ) 3958 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3959 3960 # Num command 3961 nb_command = 0 3962 3963 # Annotate 3964 for command_annotate in commands: 3965 nb_command += 1 3966 log.info( 3967 f"Annotation - 
Annotate [{nb_command}/{len(commands)}]..."
                )
                log.debug(f"command_annotate={command_annotate}")
                run_parallel_commands([command_annotate], threads)

                # Debug
                # NOTE(review): hard-coded debug copy to /tmp — presumably leftover
                # instrumentation; confirm whether this should be removed/guarded
                shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                # Update variants
                log.info(
                    f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                )
                self.update_from_vcf(commands[command_annotate])

    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table using bcftools and configured annotation databases.

        For each database configured under param["annotation"]["bcftools"]["annotations"],
        this method validates the database files (bgzip-compressed, tabix-indexed, with a
        header file), registers the requested INFO fields into the in-memory VCF header,
        builds one `bcftools annotate` command per chromosome (restricted to a BED of
        merged variant regions), runs the commands in parallel, merges the annotated
        chunks with `bcftools merge`, and finally feeds the merged VCF back through
        `self.update_from_vcf`.

        :param threads: Number of threads to use; defaults to `self.get_threads()`
        :raises ValueError: if the bcftools binary is not available, if a database file
            is not compressed/indexed/valid, or if any annotation command emitted
            "[E::" error lines
        :return: None (returns early when the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but never read again in this
        # method — temp files are actually removed via the merge command's "rm -f" tail.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific ones
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        # Mapping: database file/path -> {field: new_name} (or None for all INFO fields)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # delete=False: the file name is reused later by the export/annotate commands
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields -> take all INFO fields from the database
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzip input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (tabix .tbi required for --regions-file)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database
                    # "ALL"/"INFO" keyword expands to every INFO field of the database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not elready in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header
                            # Fall back to generic values when the database header
                            # leaves number/type/description/source/version unset
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # annotation_list.append(annotation_field)
                            # bcftools -c syntax "NEW:=INFO/OLD" renames the field
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # NOTE(review): these warnings interpolate {annotation}
                            # (full path) where the info logs above use
                            # {annotation_name} (basename) — confirm intended
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command: keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chomosomes
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        # BED databases have no INFO block: map positional columns first
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One bcftools command per chromosome
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detecte regions
                            # Pad each variant position by +/- `window` bases, clamp
                            # the start at 0, then merge overlapping intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command
                            # --pair-logic exact: annotate only records whose REF/ALT
                            # match the database exactly; -Oz1: bgzip output,
                            # compression level 1; stderr appended to the .err file
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands
                # Split the thread budget across the parallel bcftools commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() may yield 0 when there are more commands than threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    # NOTE(review): delete=True here while the name is still used by
                    # the merge command below — relies on the file surviving until
                    # update_from_vcf; confirm no early cleanup race
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (cleanup chained onto the merge)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages
                    # Scan every .err file; "[W::" lines are warnings, "[E::" are
                    # errors (htslib/bcftools message prefixes)
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotate with Exomiser

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionnary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotipic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
            "subject":
                {
                    "id": "ISDBM322017",
                    "sex": "FEMALE"
                }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
            "subject":
                {
                    "id": "<sample>",
                    "sex": "UNKNOWN_SEX"
                }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
4499 Example: 4500 "phenotypicFeatures": 4501 [ 4502 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4503 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4504 ] 4505 - "hpo" (list) 4506 List of HPO ids as phenotypic features. 4507 Example: 4508 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4509 Default: [] 4510 - "outputOptions" (dict): 4511 Output options (see Exomiser docs). 4512 Default: 4513 "output_options" = 4514 { 4515 "outputContributingVariantsOnly": False, 4516 "numGenes": 0, 4517 "outputFormats": ["TSV_VARIANT", "VCF"] 4518 } 4519 - "transcript_source" (string): 4520 Transcript source (either "refseq", "ucsc", "ensembl") 4521 Default: "refseq" 4522 - "exomiser_to_info" (boolean): 4523 Add exomiser TSV file columns as INFO fields in VCF. 4524 Default: False 4525 - "release" (string): 4526 Exomise database release. 4527 If not exists, database release will be downloaded (take a while). 4528 Default: None (provided by application.properties configuration file) 4529 - "exomiser_application_properties" (file): 4530 Exomiser configuration file (see Exomiser docs). 4531 Useful to automatically download databases (especially for specific genome databases). 4532 4533 Notes: 4534 - If no sample in parameters, first sample in VCF will be chosen 4535 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4536 4537 :param threads: The number of threads to use 4538 :return: None. 
4539 """ 4540 4541 # DEBUG 4542 log.debug("Start annotation with Exomiser databases") 4543 4544 # Threads 4545 if not threads: 4546 threads = self.get_threads() 4547 log.debug("Threads: " + str(threads)) 4548 4549 # Config 4550 config = self.get_config() 4551 log.debug("Config: " + str(config)) 4552 4553 # Config - Folders - Databases 4554 databases_folders = ( 4555 config.get("folders", {}) 4556 .get("databases", {}) 4557 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4558 ) 4559 databases_folders = full_path(databases_folders) 4560 if not os.path.exists(databases_folders): 4561 log.error(f"Databases annotations: {databases_folders} NOT found") 4562 log.debug("Databases annotations: " + str(databases_folders)) 4563 4564 # Config - Exomiser 4565 exomiser_bin_command = get_bin_command( 4566 bin="exomiser-cli*.jar", 4567 tool="exomiser", 4568 bin_type="jar", 4569 config=config, 4570 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4571 ) 4572 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4573 if not exomiser_bin_command: 4574 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4575 log.error(msg_err) 4576 raise ValueError(msg_err) 4577 4578 # Param 4579 param = self.get_param() 4580 log.debug("Param: " + str(param)) 4581 4582 # Param - Exomiser 4583 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4584 log.debug(f"Param Exomiser: {param_exomiser}") 4585 4586 # Param - Assembly 4587 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4588 log.debug("Assembly: " + str(assembly)) 4589 4590 # Data 4591 table_variants = self.get_table_variants() 4592 4593 # Check if not empty 4594 log.debug("Check if not empty") 4595 sql_query_chromosomes = ( 4596 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4597 ) 4598 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4599 log.info(f"VCF empty") 4600 return False 4601 4602 # VCF header 4603 
vcf_reader = self.get_header() 4604 log.debug("Initial header: " + str(vcf_reader.infos)) 4605 4606 # Samples 4607 samples = self.get_header_sample_list() 4608 if not samples: 4609 log.error("No Samples in VCF") 4610 return False 4611 log.debug(f"Samples: {samples}") 4612 4613 # Memory limit 4614 memory_limit = self.get_memory("8G") 4615 log.debug(f"memory_limit: {memory_limit}") 4616 4617 # Exomiser java options 4618 exomiser_java_options = ( 4619 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4620 ) 4621 log.debug(f"Exomiser java options: {exomiser_java_options}") 4622 4623 # Download Exomiser (if not exists) 4624 exomiser_release = param_exomiser.get("release", None) 4625 exomiser_application_properties = param_exomiser.get( 4626 "exomiser_application_properties", None 4627 ) 4628 databases_download_exomiser( 4629 assemblies=[assembly], 4630 exomiser_folder=databases_folders, 4631 exomiser_release=exomiser_release, 4632 exomiser_phenotype_release=exomiser_release, 4633 exomiser_application_properties=exomiser_application_properties, 4634 ) 4635 4636 # Force annotation 4637 force_update_annotation = True 4638 4639 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4640 log.debug("Start annotation Exomiser") 4641 4642 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4643 4644 # tmp_dir = "/tmp/exomiser" 4645 4646 ### ANALYSIS ### 4647 ################ 4648 4649 # Create analysis.json through analysis dict 4650 # either analysis in param or by default 4651 # depending on preset exome/genome) 4652 4653 # Init analysis dict 4654 param_exomiser_analysis_dict = {} 4655 4656 # analysis from param 4657 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4658 param_exomiser_analysis = full_path(param_exomiser_analysis) 4659 4660 # If analysis in param -> load anlaysis json 4661 if param_exomiser_analysis: 4662 4663 # If param analysis is a file and exists 4664 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4665 param_exomiser_analysis 4666 ): 4667 # Load analysis file into analysis dict (either yaml or json) 4668 with open(param_exomiser_analysis) as json_file: 4669 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4670 4671 # If param analysis is a dict 4672 elif isinstance(param_exomiser_analysis, dict): 4673 # Load analysis dict into analysis dict (either yaml or json) 4674 param_exomiser_analysis_dict = param_exomiser_analysis 4675 4676 # Error analysis type 4677 else: 4678 log.error(f"Analysis type unknown. Check param file.") 4679 raise ValueError(f"Analysis type unknown. Check param file.") 4680 4681 # Case no input analysis config file/dict 4682 # Use preset (exome/genome) to open default config file 4683 if not param_exomiser_analysis_dict: 4684 4685 # default preset 4686 default_preset = "exome" 4687 4688 # Get param preset or default preset 4689 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4690 4691 # Try to find if preset is a file 4692 if os.path.exists(param_exomiser_preset): 4693 # Preset file is provided in full path 4694 param_exomiser_analysis_default_config_file = ( 4695 param_exomiser_preset 4696 ) 4697 # elif os.path.exists(full_path(param_exomiser_preset)): 4698 # # Preset file is provided in full path 4699 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4700 elif os.path.exists( 4701 os.path.join(folder_config, param_exomiser_preset) 4702 ): 4703 # Preset file is provided a basename in config folder (can be a path with subfolders) 4704 param_exomiser_analysis_default_config_file = os.path.join( 4705 folder_config, param_exomiser_preset 4706 ) 4707 else: 4708 # Construct preset file 4709 param_exomiser_analysis_default_config_file = os.path.join( 4710 folder_config, 4711 f"preset-{param_exomiser_preset}-analysis.json", 4712 ) 4713 4714 # If preset file exists 4715 param_exomiser_analysis_default_config_file = full_path( 4716 
param_exomiser_analysis_default_config_file 4717 ) 4718 if os.path.exists(param_exomiser_analysis_default_config_file): 4719 # Load prest file into analysis dict (either yaml or json) 4720 with open( 4721 param_exomiser_analysis_default_config_file 4722 ) as json_file: 4723 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4724 json_file 4725 ) 4726 4727 # Error preset file 4728 else: 4729 log.error( 4730 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4731 ) 4732 raise ValueError( 4733 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4734 ) 4735 4736 # If no analysis dict created 4737 if not param_exomiser_analysis_dict: 4738 log.error(f"No analysis config") 4739 raise ValueError(f"No analysis config") 4740 4741 # Log 4742 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4743 4744 ### PHENOPACKET ### 4745 ################### 4746 4747 # If no PhenoPacket in analysis dict -> check in param 4748 if "phenopacket" not in param_exomiser_analysis_dict: 4749 4750 # If PhenoPacket in param -> load anlaysis json 4751 if param_exomiser.get("phenopacket", None): 4752 4753 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4754 param_exomiser_phenopacket = full_path( 4755 param_exomiser_phenopacket 4756 ) 4757 4758 # If param phenopacket is a file and exists 4759 if isinstance( 4760 param_exomiser_phenopacket, str 4761 ) and os.path.exists(param_exomiser_phenopacket): 4762 # Load phenopacket file into analysis dict (either yaml or json) 4763 with open(param_exomiser_phenopacket) as json_file: 4764 param_exomiser_analysis_dict["phenopacket"] = ( 4765 yaml.safe_load(json_file) 4766 ) 4767 4768 # If param phenopacket is a dict 4769 elif isinstance(param_exomiser_phenopacket, dict): 4770 # Load phenopacket dict into analysis dict (either yaml or json) 4771 param_exomiser_analysis_dict["phenopacket"] = ( 4772 param_exomiser_phenopacket 4773 ) 4774 4775 # Error phenopacket type 
4776 else: 4777 log.error(f"Phenopacket type unknown. Check param file.") 4778 raise ValueError( 4779 f"Phenopacket type unknown. Check param file." 4780 ) 4781 4782 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4783 if "phenopacket" not in param_exomiser_analysis_dict: 4784 4785 # Init PhenoPacket 4786 param_exomiser_analysis_dict["phenopacket"] = { 4787 "id": "analysis", 4788 "proband": {}, 4789 } 4790 4791 ### Add subject ### 4792 4793 # If subject exists 4794 param_exomiser_subject = param_exomiser.get("subject", {}) 4795 4796 # If subject not exists -> found sample ID 4797 if not param_exomiser_subject: 4798 4799 # Found sample ID in param 4800 sample = param_exomiser.get("sample", None) 4801 4802 # Find sample ID (first sample) 4803 if not sample: 4804 sample_list = self.get_header_sample_list() 4805 if len(sample_list) > 0: 4806 sample = sample_list[0] 4807 else: 4808 log.error(f"No sample found") 4809 raise ValueError(f"No sample found") 4810 4811 # Create subject 4812 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4813 4814 # Add to dict 4815 param_exomiser_analysis_dict["phenopacket"][ 4816 "subject" 4817 ] = param_exomiser_subject 4818 4819 ### Add "phenotypicFeatures" ### 4820 4821 # If phenotypicFeatures exists 4822 param_exomiser_phenotypicfeatures = param_exomiser.get( 4823 "phenotypicFeatures", [] 4824 ) 4825 4826 # If phenotypicFeatures not exists -> Try to infer from hpo list 4827 if not param_exomiser_phenotypicfeatures: 4828 4829 # Found HPO in param 4830 param_exomiser_hpo = param_exomiser.get("hpo", []) 4831 4832 # Split HPO if list in string format separated by comma 4833 if isinstance(param_exomiser_hpo, str): 4834 param_exomiser_hpo = param_exomiser_hpo.split(",") 4835 4836 # Create HPO list 4837 for hpo in param_exomiser_hpo: 4838 hpo_clean = re.sub("[^0-9]", "", hpo) 4839 param_exomiser_phenotypicfeatures.append( 4840 { 4841 "type": { 4842 "id": f"HP:{hpo_clean}", 4843 "label": 
f"HP:{hpo_clean}", 4844 } 4845 } 4846 ) 4847 4848 # Add to dict 4849 param_exomiser_analysis_dict["phenopacket"][ 4850 "phenotypicFeatures" 4851 ] = param_exomiser_phenotypicfeatures 4852 4853 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4854 if not param_exomiser_phenotypicfeatures: 4855 for step in param_exomiser_analysis_dict.get( 4856 "analysis", {} 4857 ).get("steps", []): 4858 if "hiPhivePrioritiser" in step: 4859 param_exomiser_analysis_dict.get("analysis", {}).get( 4860 "steps", [] 4861 ).remove(step) 4862 4863 ### Add Input File ### 4864 4865 # Initial file name and htsFiles 4866 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4867 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4868 { 4869 "uri": tmp_vcf_name, 4870 "htsFormat": "VCF", 4871 "genomeAssembly": assembly, 4872 } 4873 ] 4874 4875 ### Add metaData ### 4876 4877 # If metaData not in analysis dict 4878 if "metaData" not in param_exomiser_analysis_dict: 4879 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4880 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4881 "createdBy": "howard", 4882 "phenopacketSchemaVersion": 1, 4883 } 4884 4885 ### OutputOptions ### 4886 4887 # Init output result folder 4888 output_results = os.path.join(tmp_dir, "results") 4889 4890 # If no outputOptions in analysis dict 4891 if "outputOptions" not in param_exomiser_analysis_dict: 4892 4893 # default output formats 4894 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4895 4896 # Get outputOptions in param 4897 output_options = param_exomiser.get("outputOptions", None) 4898 4899 # If no output_options in param -> check 4900 if not output_options: 4901 output_options = { 4902 "outputContributingVariantsOnly": False, 4903 "numGenes": 0, 4904 "outputFormats": defaut_output_formats, 4905 } 4906 4907 # Replace outputDirectory in output options 4908 output_options["outputDirectory"] = output_results 4909 output_options["outputFileName"] = "howard" 4910 4911 # 
Add outputOptions in analysis dict 4912 param_exomiser_analysis_dict["outputOptions"] = output_options 4913 4914 else: 4915 4916 # Replace output_results and output format (if exists in param) 4917 param_exomiser_analysis_dict["outputOptions"][ 4918 "outputDirectory" 4919 ] = output_results 4920 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4921 list( 4922 set( 4923 param_exomiser_analysis_dict.get( 4924 "outputOptions", {} 4925 ).get("outputFormats", []) 4926 + ["TSV_VARIANT", "VCF"] 4927 ) 4928 ) 4929 ) 4930 4931 # log 4932 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4933 4934 ### ANALYSIS FILE ### 4935 ##################### 4936 4937 ### Full JSON analysis config file ### 4938 4939 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4940 with open(exomiser_analysis, "w") as fp: 4941 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4942 4943 ### SPLIT analysis and sample config files 4944 4945 # Splitted analysis dict 4946 param_exomiser_analysis_dict_for_split = ( 4947 param_exomiser_analysis_dict.copy() 4948 ) 4949 4950 # Phenopacket JSON file 4951 exomiser_analysis_phenopacket = os.path.join( 4952 tmp_dir, "analysis_phenopacket.json" 4953 ) 4954 with open(exomiser_analysis_phenopacket, "w") as fp: 4955 json.dump( 4956 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4957 fp, 4958 indent=4, 4959 ) 4960 4961 # Analysis JSON file without Phenopacket parameters 4962 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4963 exomiser_analysis_analysis = os.path.join( 4964 tmp_dir, "analysis_analysis.json" 4965 ) 4966 with open(exomiser_analysis_analysis, "w") as fp: 4967 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4968 4969 ### INITAL VCF file ### 4970 ####################### 4971 4972 ### Create list of samples to use and include inti initial VCF file #### 4973 4974 # Subject (main sample) 4975 # Get sample ID in analysis dict 4976 sample_subject = ( 4977 
param_exomiser_analysis_dict.get("phenopacket", {}) 4978 .get("subject", {}) 4979 .get("id", None) 4980 ) 4981 sample_proband = ( 4982 param_exomiser_analysis_dict.get("phenopacket", {}) 4983 .get("proband", {}) 4984 .get("subject", {}) 4985 .get("id", None) 4986 ) 4987 sample = [] 4988 if sample_subject: 4989 sample.append(sample_subject) 4990 if sample_proband: 4991 sample.append(sample_proband) 4992 4993 # Get sample ID within Pedigree 4994 pedigree_persons_list = ( 4995 param_exomiser_analysis_dict.get("phenopacket", {}) 4996 .get("pedigree", {}) 4997 .get("persons", {}) 4998 ) 4999 5000 # Create list with all sample ID in pedigree (if exists) 5001 pedigree_persons = [] 5002 for person in pedigree_persons_list: 5003 pedigree_persons.append(person.get("individualId")) 5004 5005 # Concat subject sample ID and samples ID in pedigreesamples 5006 samples = list(set(sample + pedigree_persons)) 5007 5008 # Check if sample list is not empty 5009 if not samples: 5010 log.error(f"No samples found") 5011 raise ValueError(f"No samples found") 5012 5013 # Create VCF with sample (either sample in param or first one by default) 5014 # Export VCF file 5015 self.export_variant_vcf( 5016 vcf_file=tmp_vcf_name, 5017 remove_info=True, 5018 add_samples=True, 5019 list_samples=samples, 5020 index=False, 5021 ) 5022 5023 ### Execute Exomiser ### 5024 ######################## 5025 5026 # Init command 5027 exomiser_command = "" 5028 5029 # Command exomiser options 5030 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5031 5032 # Release 5033 exomiser_release = param_exomiser.get("release", None) 5034 if exomiser_release: 5035 # phenotype data version 5036 exomiser_options += ( 5037 f" --exomiser.phenotype.data-version={exomiser_release} " 5038 ) 5039 # data version 5040 exomiser_options += ( 5041 f" --exomiser.{assembly}.data-version={exomiser_release} " 5042 ) 5043 # variant 
white list 5044 variant_white_list_file = ( 5045 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5046 ) 5047 if os.path.exists( 5048 os.path.join( 5049 databases_folders, assembly, variant_white_list_file 5050 ) 5051 ): 5052 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5053 5054 # transcript_source 5055 transcript_source = param_exomiser.get( 5056 "transcript_source", None 5057 ) # ucsc, refseq, ensembl 5058 if transcript_source: 5059 exomiser_options += ( 5060 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5061 ) 5062 5063 # If analysis contain proband param 5064 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5065 "proband", {} 5066 ): 5067 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5068 5069 # If no proband (usually uniq sample) 5070 else: 5071 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5072 5073 # Log 5074 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5075 5076 # Run command 5077 result = subprocess.call( 5078 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5079 ) 5080 if result: 5081 log.error("Exomiser command failed") 5082 raise ValueError("Exomiser command failed") 5083 5084 ### RESULTS ### 5085 ############### 5086 5087 ### Annotate with TSV fields ### 5088 5089 # Init result tsv file 5090 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5091 5092 # Init result tsv file 5093 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5094 5095 # Parse TSV file and explode columns in INFO field 5096 if exomiser_to_info and os.path.exists(output_results_tsv): 5097 5098 # Log 5099 log.debug("Exomiser columns to VCF INFO field") 5100 5101 # Retrieve columns and types 5102 query = f""" SELECT * FROM read_csv('{output_results_tsv}', 
auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5103 output_results_tsv_df = self.get_query_to_df(query) 5104 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5105 5106 # Init concat fields for update 5107 sql_query_update_concat_fields = [] 5108 5109 # Fields to avoid 5110 fields_to_avoid = [ 5111 "CONTIG", 5112 "START", 5113 "END", 5114 "REF", 5115 "ALT", 5116 "QUAL", 5117 "FILTER", 5118 "GENOTYPE", 5119 ] 5120 5121 # List all columns to add into header 5122 for header_column in output_results_tsv_columns: 5123 5124 # If header column is enable 5125 if header_column not in fields_to_avoid: 5126 5127 # Header info type 5128 header_info_type = "String" 5129 header_column_df = output_results_tsv_df[header_column] 5130 header_column_df_dtype = header_column_df.dtype 5131 if header_column_df_dtype == object: 5132 if ( 5133 pd.to_numeric(header_column_df, errors="coerce") 5134 .notnull() 5135 .all() 5136 ): 5137 header_info_type = "Float" 5138 else: 5139 header_info_type = "Integer" 5140 5141 # Header info 5142 characters_to_validate = ["-"] 5143 pattern = "[" + "".join(characters_to_validate) + "]" 5144 header_info_name = re.sub( 5145 pattern, 5146 "_", 5147 f"Exomiser_{header_column}".replace("#", ""), 5148 ) 5149 header_info_number = "." 
5150 header_info_description = ( 5151 f"Exomiser {header_column} annotation" 5152 ) 5153 header_info_source = "Exomiser" 5154 header_info_version = "unknown" 5155 header_info_code = CODE_TYPE_MAP[header_info_type] 5156 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5157 header_info_name, 5158 header_info_number, 5159 header_info_type, 5160 header_info_description, 5161 header_info_source, 5162 header_info_version, 5163 header_info_code, 5164 ) 5165 5166 # Add field to add for update to concat fields 5167 sql_query_update_concat_fields.append( 5168 f""" 5169 CASE 5170 WHEN table_parquet."{header_column}" NOT IN ('','.') 5171 THEN concat( 5172 '{header_info_name}=', 5173 table_parquet."{header_column}", 5174 ';' 5175 ) 5176 5177 ELSE '' 5178 END 5179 """ 5180 ) 5181 5182 # Update query 5183 sql_query_update = f""" 5184 UPDATE {table_variants} as table_variants 5185 SET INFO = concat( 5186 CASE 5187 WHEN INFO NOT IN ('', '.') 5188 THEN INFO 5189 ELSE '' 5190 END, 5191 CASE 5192 WHEN table_variants.INFO NOT IN ('','.') 5193 THEN ';' 5194 ELSE '' 5195 END, 5196 ( 5197 SELECT 5198 concat( 5199 {",".join(sql_query_update_concat_fields)} 5200 ) 5201 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5202 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5203 AND table_parquet.\"START\" = table_variants.\"POS\" 5204 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5205 AND table_parquet.\"REF\" = table_variants.\"REF\" 5206 ) 5207 ) 5208 ; 5209 """ 5210 5211 # Update 5212 self.conn.execute(sql_query_update) 5213 5214 ### Annotate with VCF INFO field ### 5215 5216 # Init result VCF file 5217 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5218 5219 # If VCF exists 5220 if os.path.exists(output_results_vcf): 5221 5222 # Log 5223 log.debug("Exomiser result VCF update variants") 5224 5225 # Find Exomiser INFO field annotation in header 5226 with 
gzip.open(output_results_vcf, "rt") as f: 5227 header_list = self.read_vcf_header(f) 5228 exomiser_vcf_header = vcf.Reader( 5229 io.StringIO("\n".join(header_list)) 5230 ) 5231 5232 # Add annotation INFO field to header 5233 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5234 5235 # Update variants with VCF 5236 self.update_from_vcf(output_results_vcf) 5237 5238 return True 5239 5240 def annotation_snpeff(self, threads: int = None) -> None: 5241 """ 5242 This function annotate with snpEff 5243 5244 :param threads: The number of threads to use 5245 :return: the value of the variable "return_value". 5246 """ 5247 5248 # DEBUG 5249 log.debug("Start annotation with snpeff databases") 5250 5251 # Threads 5252 if not threads: 5253 threads = self.get_threads() 5254 log.debug("Threads: " + str(threads)) 5255 5256 # DEBUG 5257 delete_tmp = True 5258 if self.get_config().get("verbosity", "warning") in ["debug"]: 5259 delete_tmp = False 5260 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5261 5262 # Config 5263 config = self.get_config() 5264 log.debug("Config: " + str(config)) 5265 5266 # Config - Folders - Databases 5267 databases_folders = ( 5268 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5269 ) 5270 log.debug("Databases annotations: " + str(databases_folders)) 5271 5272 # Config - snpEff bin command 5273 snpeff_bin_command = get_bin_command( 5274 bin="snpEff.jar", 5275 tool="snpeff", 5276 bin_type="jar", 5277 config=config, 5278 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5279 ) 5280 if not snpeff_bin_command: 5281 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5282 log.error(msg_err) 5283 raise ValueError(msg_err) 5284 5285 # Config - snpEff databases 5286 snpeff_databases = ( 5287 config.get("folders", {}) 5288 .get("databases", {}) 5289 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5290 ) 5291 snpeff_databases = full_path(snpeff_databases) 5292 if snpeff_databases is not None and 
snpeff_databases != "": 5293 log.debug(f"Create snpEff databases folder") 5294 if not os.path.exists(snpeff_databases): 5295 os.makedirs(snpeff_databases) 5296 5297 # Param 5298 param = self.get_param() 5299 log.debug("Param: " + str(param)) 5300 5301 # Param 5302 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5303 log.debug("Options: " + str(options)) 5304 5305 # Param - Assembly 5306 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5307 5308 # Param - Options 5309 snpeff_options = ( 5310 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5311 ) 5312 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5313 snpeff_csvstats = ( 5314 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5315 ) 5316 if snpeff_stats: 5317 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5318 snpeff_stats = full_path(snpeff_stats) 5319 snpeff_options += f" -stats {snpeff_stats}" 5320 if snpeff_csvstats: 5321 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5322 snpeff_csvstats = full_path(snpeff_csvstats) 5323 snpeff_options += f" -csvStats {snpeff_csvstats}" 5324 5325 # Data 5326 table_variants = self.get_table_variants() 5327 5328 # Check if not empty 5329 log.debug("Check if not empty") 5330 sql_query_chromosomes = ( 5331 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5332 ) 5333 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5334 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5335 log.info(f"VCF empty") 5336 return 5337 5338 # Export in VCF 5339 log.debug("Create initial file to annotate") 5340 tmp_vcf = NamedTemporaryFile( 5341 prefix=self.get_prefix(), 5342 dir=self.get_tmp_dir(), 5343 suffix=".vcf.gz", 5344 delete=True, 5345 ) 5346 tmp_vcf_name = tmp_vcf.name 5347 5348 # VCF header 5349 vcf_reader = self.get_header() 5350 log.debug("Initial header: " + 
str(vcf_reader.infos)) 5351 5352 # Existing annotations 5353 for vcf_annotation in self.get_header().infos: 5354 5355 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5356 log.debug( 5357 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5358 ) 5359 5360 # Memory limit 5361 # if config.get("memory", None): 5362 # memory_limit = config.get("memory", "8G") 5363 # else: 5364 # memory_limit = "8G" 5365 memory_limit = self.get_memory("8G") 5366 log.debug(f"memory_limit: {memory_limit}") 5367 5368 # snpEff java options 5369 snpeff_java_options = ( 5370 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5371 ) 5372 log.debug(f"Exomiser java options: {snpeff_java_options}") 5373 5374 force_update_annotation = True 5375 5376 if "ANN" not in self.get_header().infos or force_update_annotation: 5377 5378 # Check snpEff database 5379 log.debug(f"Check snpEff databases {[assembly]}") 5380 databases_download_snpeff( 5381 folder=snpeff_databases, assemblies=[assembly], config=config 5382 ) 5383 5384 # Export VCF file 5385 self.export_variant_vcf( 5386 vcf_file=tmp_vcf_name, 5387 remove_info=True, 5388 add_samples=False, 5389 index=True, 5390 ) 5391 5392 # Tmp file 5393 err_files = [] 5394 tmp_annotate_vcf = NamedTemporaryFile( 5395 prefix=self.get_prefix(), 5396 dir=self.get_tmp_dir(), 5397 suffix=".vcf", 5398 delete=False, 5399 ) 5400 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5401 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5402 err_files.append(tmp_annotate_vcf_name_err) 5403 5404 # Command 5405 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5406 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5407 run_parallel_commands([snpeff_command], 1) 5408 5409 # Error messages 5410 log.info(f"Error/Warning messages:") 5411 error_message_command_all = [] 5412 
error_message_command_warning = [] 5413 error_message_command_err = [] 5414 for err_file in err_files: 5415 with open(err_file, "r") as f: 5416 for line in f: 5417 message = line.strip() 5418 error_message_command_all.append(message) 5419 if line.startswith("[W::"): 5420 error_message_command_warning.append(message) 5421 if line.startswith("[E::"): 5422 error_message_command_err.append(f"{err_file}: " + message) 5423 # log info 5424 for message in list( 5425 set(error_message_command_err + error_message_command_warning) 5426 ): 5427 log.info(f" {message}") 5428 # debug info 5429 for message in list(set(error_message_command_all)): 5430 log.debug(f" {message}") 5431 # failed 5432 if len(error_message_command_err): 5433 log.error("Annotation failed: Error in commands") 5434 raise ValueError("Annotation failed: Error in commands") 5435 5436 # Find annotation in header 5437 with open(tmp_annotate_vcf_name, "rt") as f: 5438 header_list = self.read_vcf_header(f) 5439 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5440 5441 for ann in annovar_vcf_header.infos: 5442 if ann not in self.get_header().infos: 5443 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5444 5445 # Update variants 5446 log.info(f"Annotation - Updating...") 5447 self.update_from_vcf(tmp_annotate_vcf_name) 5448 5449 else: 5450 if "ANN" in self.get_header().infos: 5451 log.debug(f"Existing snpEff annotations in VCF") 5452 if force_update_annotation: 5453 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 5454 5455 def annotation_annovar(self, threads: int = None) -> None: 5456 """ 5457 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5458 annotations 5459 5460 :param threads: number of threads to use 5461 :return: the value of the variable "return_value". 
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files (collected for final cleanup / error scanning)
        tmp_files = []
        err_files = []

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (needed for the post-processing pipe and merge)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases (a list falls back to its first entry)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly (per-assembly subfolder, created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is initialized but never used in this method
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run (and one annotated VCF) per requested database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is re-initialized here, so only the current
                # iteration's .err file is scanned by the error check below
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (old INFO tag -> new name, one mapping per line)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" for gene-based, "r" for region-based, "f" (default) filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options (every remaining option is passed straight to table_annovar.pl)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr for htslib ([W::]/[E::]) and plain WARNING/ERROR markers
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            # Merge all per-database annotated VCFs with the initial VCF, then update the table
            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header and merge new INFO definitions into ours
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): delete_tmp is computed above but not consulted here — cleanup always runs
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        It takes a VCF file, and annotates it with a parquet file

        :param threads: number of threads to use for the annotation
        :return: the value of the variable "result".
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: merge the "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update",
False) 5901 ) 5902 log.debug(f"force_update_annotation={force_update_annotation}") 5903 force_append_annotation = ( 5904 self.get_param() 5905 .get("annotation", {}) 5906 .get("options", {}) 5907 .get("annotations_append", False) 5908 ) 5909 log.debug(f"force_append_annotation={force_append_annotation}") 5910 5911 # Data 5912 table_variants = self.get_table_variants() 5913 5914 # Check if not empty 5915 log.debug("Check if not empty") 5916 sql_query_chromosomes_df = self.get_query_to_df( 5917 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5918 ) 5919 if not sql_query_chromosomes_df["count"][0]: 5920 log.info(f"VCF empty") 5921 return 5922 5923 # VCF header 5924 vcf_reader = self.get_header() 5925 log.debug("Initial header: " + str(vcf_reader.infos)) 5926 5927 # Nb Variants POS 5928 log.debug("NB Variants Start") 5929 nb_variants = self.conn.execute( 5930 f"SELECT count(*) AS count FROM variants" 5931 ).fetchdf()["count"][0] 5932 log.debug("NB Variants Stop") 5933 5934 # Existing annotations 5935 for vcf_annotation in self.get_header().infos: 5936 5937 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5938 log.debug( 5939 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5940 ) 5941 5942 # Added columns 5943 added_columns = [] 5944 5945 # drop indexes 5946 log.debug(f"Drop indexes...") 5947 self.drop_indexes() 5948 5949 if annotations: 5950 5951 if "ALL" in annotations: 5952 5953 all_param = annotations.get("ALL", {}) 5954 all_param_formats = all_param.get("formats", None) 5955 all_param_releases = all_param.get("releases", None) 5956 5957 databases_infos_dict = self.scan_databases( 5958 database_formats=all_param_formats, 5959 database_releases=all_param_releases, 5960 ) 5961 for database_infos in databases_infos_dict.keys(): 5962 if database_infos not in annotations: 5963 annotations[database_infos] = {"INFO": None} 5964 5965 for annotation in annotations: 5966 5967 if annotation in ["ALL"]: 
5968 continue 5969 5970 # Annotation Name 5971 annotation_name = os.path.basename(annotation) 5972 5973 # Annotation fields 5974 annotation_fields = annotations[annotation] 5975 if not annotation_fields: 5976 annotation_fields = {"INFO": None} 5977 5978 log.debug(f"Annotation '{annotation_name}'") 5979 log.debug( 5980 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5981 ) 5982 5983 # Create Database 5984 database = Database( 5985 database=annotation, 5986 databases_folders=databases_folders, 5987 assembly=assembly, 5988 ) 5989 5990 # Find files 5991 parquet_file = database.get_database() 5992 parquet_hdr_file = database.get_header_file() 5993 parquet_type = database.get_type() 5994 5995 # Check if files exists 5996 if not parquet_file or not parquet_hdr_file: 5997 msg_err_list = [] 5998 if not parquet_file: 5999 msg_err_list.append( 6000 f"Annotation failed: Annotation file not found" 6001 ) 6002 if parquet_file and not parquet_hdr_file: 6003 msg_err_list.append( 6004 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 6005 ) 6006 6007 log.error(". ".join(msg_err_list)) 6008 raise ValueError(". 
".join(msg_err_list)) 6009 else: 6010 # Get parquet connexion 6011 parquet_sql_attach = database.get_sql_database_attach( 6012 output="query" 6013 ) 6014 if parquet_sql_attach: 6015 self.conn.execute(parquet_sql_attach) 6016 parquet_file_link = database.get_sql_database_link() 6017 # Log 6018 log.debug( 6019 f"Annotation '{annotation_name}' - file: " 6020 + str(parquet_file) 6021 + " and " 6022 + str(parquet_hdr_file) 6023 ) 6024 6025 # Database full header columns 6026 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6027 parquet_hdr_file 6028 ) 6029 # Log 6030 log.debug( 6031 "Annotation database header columns : " 6032 + str(parquet_hdr_vcf_header_columns) 6033 ) 6034 6035 # Load header as VCF object 6036 parquet_hdr_vcf_header_infos = database.get_header().infos 6037 # Log 6038 log.debug( 6039 "Annotation database header: " 6040 + str(parquet_hdr_vcf_header_infos) 6041 ) 6042 6043 # Get extra infos 6044 parquet_columns = database.get_extra_columns() 6045 # Log 6046 log.debug("Annotation database Columns: " + str(parquet_columns)) 6047 6048 # Add extra columns if "ALL" in annotation_fields 6049 # if "ALL" in annotation_fields: 6050 # allow_add_extra_column = True 6051 if "ALL" in annotation_fields and database.get_extra_columns(): 6052 for extra_column in database.get_extra_columns(): 6053 if ( 6054 extra_column not in annotation_fields 6055 and extra_column.replace("INFO/", "") 6056 not in parquet_hdr_vcf_header_infos 6057 ): 6058 parquet_hdr_vcf_header_infos[extra_column] = ( 6059 vcf.parser._Info( 6060 extra_column, 6061 ".", 6062 "String", 6063 f"{extra_column} description", 6064 "unknown", 6065 "unknown", 6066 self.code_type_map["String"], 6067 ) 6068 ) 6069 6070 # For all fields in database 6071 annotation_fields_all = False 6072 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6073 annotation_fields_all = True 6074 annotation_fields = { 6075 key: key for key in parquet_hdr_vcf_header_infos 6076 } 6077 6078 log.debug( 6079 
"Annotation database header - All annotations added: " 6080 + str(annotation_fields) 6081 ) 6082 6083 # Init 6084 6085 # List of annotation fields to use 6086 sql_query_annotation_update_info_sets = [] 6087 6088 # List of annotation to agregate 6089 sql_query_annotation_to_agregate = [] 6090 6091 # Number of fields 6092 nb_annotation_field = 0 6093 6094 # Annotation fields processed 6095 annotation_fields_processed = [] 6096 6097 # Columns mapping 6098 map_columns = database.map_columns( 6099 columns=annotation_fields, prefixes=["INFO/"] 6100 ) 6101 6102 # Query dict for fields to remove (update option) 6103 query_dict_remove = {} 6104 6105 # Fetch Anotation fields 6106 for annotation_field in annotation_fields: 6107 6108 # annotation_field_column 6109 annotation_field_column = map_columns.get( 6110 annotation_field, "INFO" 6111 ) 6112 6113 # field new name, if parametered 6114 annotation_fields_new_name = annotation_fields.get( 6115 annotation_field, annotation_field 6116 ) 6117 if not annotation_fields_new_name: 6118 annotation_fields_new_name = annotation_field 6119 6120 # To annotate 6121 # force_update_annotation = True 6122 # force_append_annotation = True 6123 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6124 if annotation_field in parquet_hdr_vcf_header_infos and ( 6125 force_update_annotation 6126 or force_append_annotation 6127 or ( 6128 annotation_fields_new_name 6129 not in self.get_header().infos 6130 ) 6131 ): 6132 6133 # Add field to annotation to process list 6134 annotation_fields_processed.append( 6135 annotation_fields_new_name 6136 ) 6137 6138 # explode infos for the field 6139 annotation_fields_new_name_info_msg = "" 6140 if ( 6141 force_update_annotation 6142 and annotation_fields_new_name 6143 in self.get_header().infos 6144 ): 6145 # Remove field from INFO 6146 query = f""" 6147 UPDATE {table_variants} as table_variants 6148 SET INFO = 
REGEXP_REPLACE( 6149 concat(table_variants.INFO,''), 6150 ';*{annotation_fields_new_name}=[^;]*', 6151 '' 6152 ) 6153 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6154 """ 6155 annotation_fields_new_name_info_msg = " [update]" 6156 query_dict_remove[ 6157 f"remove 'INFO/{annotation_fields_new_name}'" 6158 ] = query 6159 6160 # Sep between fields in INFO 6161 nb_annotation_field += 1 6162 if nb_annotation_field > 1: 6163 annotation_field_sep = ";" 6164 else: 6165 annotation_field_sep = "" 6166 6167 log.info( 6168 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6169 ) 6170 6171 # Add INFO field to header 6172 parquet_hdr_vcf_header_infos_number = ( 6173 parquet_hdr_vcf_header_infos[annotation_field].num 6174 or "." 6175 ) 6176 parquet_hdr_vcf_header_infos_type = ( 6177 parquet_hdr_vcf_header_infos[annotation_field].type 6178 or "String" 6179 ) 6180 parquet_hdr_vcf_header_infos_description = ( 6181 parquet_hdr_vcf_header_infos[annotation_field].desc 6182 or f"{annotation_field} description" 6183 ) 6184 parquet_hdr_vcf_header_infos_source = ( 6185 parquet_hdr_vcf_header_infos[annotation_field].source 6186 or "unknown" 6187 ) 6188 parquet_hdr_vcf_header_infos_version = ( 6189 parquet_hdr_vcf_header_infos[annotation_field].version 6190 or "unknown" 6191 ) 6192 6193 vcf_reader.infos[annotation_fields_new_name] = ( 6194 vcf.parser._Info( 6195 annotation_fields_new_name, 6196 parquet_hdr_vcf_header_infos_number, 6197 parquet_hdr_vcf_header_infos_type, 6198 parquet_hdr_vcf_header_infos_description, 6199 parquet_hdr_vcf_header_infos_source, 6200 parquet_hdr_vcf_header_infos_version, 6201 self.code_type_map[ 6202 parquet_hdr_vcf_header_infos_type 6203 ], 6204 ) 6205 ) 6206 6207 # Append 6208 if force_append_annotation: 6209 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 
6210 else: 6211 query_case_when_append = "" 6212 6213 # Annotation/Update query fields 6214 # Found in INFO column 6215 if ( 6216 annotation_field_column == "INFO" 6217 and "INFO" in parquet_hdr_vcf_header_columns 6218 ): 6219 sql_query_annotation_update_info_sets.append( 6220 f""" 6221 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6222 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6223 ELSE '' 6224 END 6225 """ 6226 ) 6227 # Found in a specific column 6228 else: 6229 sql_query_annotation_update_info_sets.append( 6230 f""" 6231 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6232 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6233 ELSE '' 6234 END 6235 """ 6236 ) 6237 sql_query_annotation_to_agregate.append( 6238 f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6239 ) 6240 6241 # Not to annotate 6242 else: 6243 6244 if force_update_annotation: 6245 annotation_message = "forced" 6246 else: 6247 annotation_message = "skipped" 6248 6249 if annotation_field not in parquet_hdr_vcf_header_infos: 6250 log.warning( 6251 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6252 ) 6253 if annotation_fields_new_name in self.get_header().infos: 6254 log.warning( 6255 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6256 ) 6257 6258 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6259 # allow_annotation_full_info = True 6260 allow_annotation_full_info = not force_append_annotation 6261 6262 if parquet_type in ["regions"]: 6263 allow_annotation_full_info = False 6264 6265 if ( 6266 allow_annotation_full_info 6267 and nb_annotation_field == len(annotation_fields) 6268 and annotation_fields_all 6269 and ( 6270 "INFO" in parquet_hdr_vcf_header_columns 6271 and "INFO" in database.get_extra_columns() 6272 ) 6273 ): 6274 log.debug("Column INFO annotation enabled") 6275 sql_query_annotation_update_info_sets = [] 6276 sql_query_annotation_update_info_sets.append( 6277 f" table_parquet.INFO " 6278 ) 6279 6280 if sql_query_annotation_update_info_sets: 6281 6282 # Annotate 6283 log.info(f"Annotation '{annotation_name}' - Annotation...") 6284 6285 # Join query annotation update info sets for SQL 6286 sql_query_annotation_update_info_sets_sql = ",".join( 6287 sql_query_annotation_update_info_sets 6288 ) 6289 6290 # Check chromosomes list (and variants infos) 6291 sql_query_chromosomes = f""" 6292 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6293 FROM {table_variants} as table_variants 6294 GROUP BY table_variants."#CHROM" 6295 ORDER BY table_variants."#CHROM" 6296 """ 6297 sql_query_chromosomes_df = self.conn.execute( 6298 sql_query_chromosomes 6299 ).df() 6300 sql_query_chromosomes_dict = { 6301 entry["CHROM"]: { 6302 "count": entry["count_variants"], 6303 "min": entry["min_variants"], 6304 "max": entry["max_variants"], 6305 } 6306 for index, entry in sql_query_chromosomes_df.iterrows() 6307 } 6308 6309 # Init 6310 nb_of_query = 0 6311 nb_of_variant_annotated = 0 6312 query_dict = query_dict_remove 6313 6314 # for chrom in sql_query_chromosomes_df["CHROM"]: 6315 for chrom in sql_query_chromosomes_dict: 6316 6317 # Number of variant by chromosome 6318 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6319 chrom, {} 6320 ).get("count", 0) 6321 6322 
log.debug( 6323 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 6324 ) 6325 6326 # Annotation with regions database 6327 if parquet_type in ["regions"]: 6328 sql_query_annotation_from_clause = f""" 6329 FROM ( 6330 SELECT 6331 '{chrom}' AS \"#CHROM\", 6332 table_variants_from.\"POS\" AS \"POS\", 6333 {",".join(sql_query_annotation_to_agregate)} 6334 FROM {table_variants} as table_variants_from 6335 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6336 table_parquet_from."#CHROM" = '{chrom}' 6337 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6338 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6339 ) 6340 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6341 GROUP BY table_variants_from.\"POS\" 6342 ) 6343 as table_parquet 6344 """ 6345 6346 sql_query_annotation_where_clause = """ 6347 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6348 AND table_parquet.\"POS\" = table_variants.\"POS\" 6349 """ 6350 6351 # Annotation with variants database 6352 else: 6353 sql_query_annotation_from_clause = f""" 6354 FROM {parquet_file_link} as table_parquet 6355 """ 6356 sql_query_annotation_where_clause = f""" 6357 table_variants."#CHROM" = '{chrom}' 6358 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6359 AND table_parquet.\"POS\" = table_variants.\"POS\" 6360 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6361 AND table_parquet.\"REF\" = table_variants.\"REF\" 6362 """ 6363 6364 # Create update query 6365 sql_query_annotation_chrom_interval_pos = f""" 6366 UPDATE {table_variants} as table_variants 6367 SET INFO = 6368 concat( 6369 CASE WHEN table_variants.INFO NOT IN ('','.') 6370 THEN table_variants.INFO 6371 ELSE '' 6372 END 6373 , 6374 CASE WHEN table_variants.INFO NOT IN ('','.') 6375 AND ( 6376 concat({sql_query_annotation_update_info_sets_sql}) 6377 ) 6378 NOT IN ('','.') 6379 THEN ';' 6380 ELSE '' 6381 END 6382 , 6383 
{sql_query_annotation_update_info_sets_sql} 6384 ) 6385 {sql_query_annotation_from_clause} 6386 WHERE {sql_query_annotation_where_clause} 6387 ; 6388 """ 6389 6390 # Add update query to dict 6391 query_dict[ 6392 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6393 ] = sql_query_annotation_chrom_interval_pos 6394 6395 nb_of_query = len(query_dict) 6396 num_query = 0 6397 6398 # SET max_expression_depth TO x 6399 self.conn.execute("SET max_expression_depth TO 10000") 6400 6401 for query_name in query_dict: 6402 query = query_dict[query_name] 6403 num_query += 1 6404 log.info( 6405 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6406 ) 6407 result = self.conn.execute(query) 6408 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6409 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6410 log.info( 6411 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6412 ) 6413 6414 log.info( 6415 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6416 ) 6417 6418 else: 6419 6420 log.info( 6421 f"Annotation '{annotation_name}' - No Annotations available" 6422 ) 6423 6424 log.debug("Final header: " + str(vcf_reader.infos)) 6425 6426 # Remove added columns 6427 for added_column in added_columns: 6428 self.drop_column(column=added_column) 6429 6430 def annotation_splice(self, threads: int = None) -> None: 6431 """ 6432 This function annotate with snpEff 6433 6434 :param threads: The number of threads to use 6435 :return: the value of the variable "return_value". 
6436 """ 6437 6438 # DEBUG 6439 log.debug("Start annotation with splice tools") 6440 6441 # Threads 6442 if not threads: 6443 threads = self.get_threads() 6444 log.debug("Threads: " + str(threads)) 6445 6446 # DEBUG 6447 delete_tmp = True 6448 if self.get_config().get("verbosity", "warning") in ["debug"]: 6449 delete_tmp = False 6450 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6451 6452 # Config 6453 config = self.get_config() 6454 log.debug("Config: " + str(config)) 6455 splice_config = config.get("tools", {}).get("splice", {}) 6456 if not splice_config: 6457 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6458 msg_err = "No Splice tool config" 6459 raise ValueError(msg_err) 6460 log.debug(f"splice_config: {splice_config}") 6461 6462 # Config - Folders - Databases 6463 databases_folders = ( 6464 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6465 ) 6466 log.debug("Databases annotations: " + str(databases_folders)) 6467 6468 # Splice docker image 6469 splice_docker_image = splice_config.get("docker").get("image") 6470 6471 # Pull splice image if it's not already there 6472 if not check_docker_image_exists(splice_docker_image): 6473 log.warning( 6474 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6475 ) 6476 try: 6477 command(f"docker pull {splice_config.get('docker').get('image')}") 6478 except subprocess.CalledProcessError: 6479 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6480 log.error(msg_err) 6481 raise ValueError(msg_err) 6482 6483 # Config - splice databases 6484 splice_databases = ( 6485 config.get("folders", {}) 6486 .get("databases", {}) 6487 .get("splice", DEFAULT_SPLICE_FOLDER) 6488 ) 6489 splice_databases = full_path(splice_databases) 6490 6491 # Param 6492 param = self.get_param() 6493 log.debug("Param: " + str(param)) 6494 6495 # Param 6496 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6497 
log.debug("Options: " + str(options)) 6498 6499 # Data 6500 table_variants = self.get_table_variants() 6501 6502 # Check if not empty 6503 log.debug("Check if not empty") 6504 sql_query_chromosomes = ( 6505 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6506 ) 6507 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6508 log.info("VCF empty") 6509 return None 6510 6511 # Export in VCF 6512 log.debug("Create initial file to annotate") 6513 6514 # Create output folder / work folder 6515 if options.get("output_folder", ""): 6516 output_folder = options.get("output_folder", "") 6517 if not os.path.exists(output_folder): 6518 Path(output_folder).mkdir(parents=True, exist_ok=True) 6519 else: 6520 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6521 if not os.path.exists(output_folder): 6522 Path(output_folder).mkdir(parents=True, exist_ok=True) 6523 6524 if options.get("workdir", ""): 6525 workdir = options.get("workdir", "") 6526 else: 6527 workdir = "/work" 6528 6529 # Create tmp VCF file 6530 tmp_vcf = NamedTemporaryFile( 6531 prefix=self.get_prefix(), 6532 dir=output_folder, 6533 suffix=".vcf", 6534 delete=False, 6535 ) 6536 tmp_vcf_name = tmp_vcf.name 6537 6538 # VCF header 6539 header = self.get_header() 6540 6541 # Existing annotations 6542 for vcf_annotation in self.get_header().infos: 6543 6544 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6545 log.debug( 6546 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6547 ) 6548 6549 # Memory limit 6550 if config.get("memory", None): 6551 memory_limit = config.get("memory", "8G").upper() 6552 # upper() 6553 else: 6554 memory_limit = "8G" 6555 log.debug(f"memory_limit: {memory_limit}") 6556 6557 # Check number of variants to annotate 6558 where_clause_regex_spliceai = r"SpliceAI_\w+" 6559 where_clause_regex_spip = r"SPiP_\w+" 6560 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6561 df_list_of_variants_to_annotate = self.get_query_to_df( 6562 query=f""" SELECT * FROM variants {where_clause} """ 6563 ) 6564 if len(df_list_of_variants_to_annotate) == 0: 6565 log.warning( 6566 f"No variants to annotate with splice. Variants probably already annotated with splice" 6567 ) 6568 return None 6569 else: 6570 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6571 6572 # Export VCF file 6573 self.export_variant_vcf( 6574 vcf_file=tmp_vcf_name, 6575 remove_info=True, 6576 add_samples=True, 6577 index=False, 6578 where_clause=where_clause, 6579 ) 6580 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6581 if any(value for value in splice_config.values() if value is None): 6582 log.warning("At least one splice config parameter is empty") 6583 # exit annotation_splice 6584 return None 6585 6586 # Params in splice nf 6587 def check_values(dico: dict): 6588 """ 6589 Ensure parameters for NF splice pipeline 6590 """ 6591 for key, val in dico.items(): 6592 if key == "genome": 6593 if any( 6594 assemb in options.get("genome", {}) 6595 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6596 ): 6597 yield f"--{key} hg19" 6598 elif any( 6599 assemb in options.get("genome", {}) 6600 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6601 ): 6602 yield f"--{key} hg38" 6603 elif ( 6604 (isinstance(val, str) and val) 6605 or isinstance(val, int) 6606 or isinstance(val, bool) 6607 ): 6608 yield f"--{key} {val}" 6609 6610 # Genome 6611 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6612 options["genome"] = genome 6613 # NF params 6614 nf_params = [] 6615 # Add options 6616 if options: 6617 log.debug(options) 6618 nf_params = list(check_values(options)) 6619 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6620 else: 6621 log.debug("No NF params provided") 6622 # Add threads 6623 if "threads" not in 
options.keys(): 6624 nf_params.append(f"--threads {threads}") 6625 # Genome path 6626 genome_path = find_genome( 6627 config.get("folders", {}) 6628 .get("databases", {}) 6629 .get("genomes", DEFAULT_GENOME_FOLDER), 6630 file=f"{genome}.fa", 6631 ) 6632 # Add genome path 6633 if not genome_path: 6634 raise ValueError( 6635 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6636 ) 6637 else: 6638 log.debug(f"Genome: {genome_path}") 6639 nf_params.append(f"--genome_path {genome_path}") 6640 6641 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6642 """ 6643 Setting up updated databases for SPiP and SpliceAI 6644 """ 6645 6646 try: 6647 6648 # SpliceAI assembly transcriptome 6649 spliceai_assembly = os.path.join( 6650 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6651 options.get("genome"), 6652 "transcriptome", 6653 ) 6654 spip_assembly = options.get("genome") 6655 6656 spip = find( 6657 f"transcriptome_{spip_assembly}.RData", 6658 config.get("folders", {}).get("databases", {}).get("spip", {}), 6659 ) 6660 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6661 log.debug(f"SPiP annotations: {spip}") 6662 log.debug(f"SpliceAI annotations: {spliceai}") 6663 if spip and spliceai: 6664 return [ 6665 f"--spip_transcriptome {spip}", 6666 f"--spliceai_transcriptome {spliceai}", 6667 ] 6668 else: 6669 log.warning( 6670 "Can't find splice databases in configuration, use annotations file from image" 6671 ) 6672 except TypeError: 6673 log.warning( 6674 "Can't find splice databases in configuration, use annotations file from image" 6675 ) 6676 return [] 6677 6678 # Add options, check if transcriptome option have already beend provided 6679 if ( 6680 "spip_transcriptome" not in nf_params 6681 and "spliceai_transcriptome" not in nf_params 6682 ): 6683 splice_reference = splice_annotations(options, config) 6684 if splice_reference: 6685 
nf_params.extend(splice_reference) 6686 # nf_params.append(f"--output_folder {output_folder}") 6687 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6688 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6689 log.debug(cmd) 6690 splice_config["docker"]["command"] = cmd 6691 6692 # Ensure proxy is set 6693 proxy = [ 6694 f"-e {var}={os.getenv(var)}" 6695 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6696 if os.getenv(var) is not None 6697 ] 6698 docker_cmd = get_bin_command( 6699 tool="splice", 6700 bin_type="docker", 6701 config=config, 6702 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6703 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6704 ) 6705 # print(docker_cmd) 6706 # exit() 6707 # Docker debug 6708 # if splice_config.get("rm_container"): 6709 # rm_container = "--rm" 6710 # else: 6711 # rm_container = "" 6712 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6713 log.debug(docker_cmd) 6714 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6715 log.debug(res.stdout) 6716 if res.stderr: 6717 log.error(res.stderr) 6718 res.check_returncode() 6719 # Update variants 6720 log.info("Annotation - Updating...") 6721 # Test find output vcf 6722 log.debug( 6723 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6724 ) 6725 output_vcf = [] 6726 # Wrong folder to look in 6727 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6728 if ( 6729 files 6730 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6731 ): 6732 
output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6733 # log.debug(os.listdir(options.get("output_folder"))) 6734 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6735 if not output_vcf: 6736 log.debug( 6737 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6738 ) 6739 else: 6740 # Get new header from annotated vcf 6741 log.debug(f"Initial header: {len(header.infos)} fields") 6742 # Create new header with splice infos 6743 new_vcf = Variants(input=output_vcf[0]) 6744 new_vcf_header = new_vcf.get_header().infos 6745 for keys, infos in new_vcf_header.items(): 6746 if keys not in header.infos.keys(): 6747 header.infos[keys] = infos 6748 log.debug(f"New header: {len(header.infos)} fields") 6749 log.debug(f"Splice tmp output: {output_vcf[0]}") 6750 self.update_from_vcf(output_vcf[0]) 6751 6752 # Remove file 6753 remove_if_exists(output_vcf) 6754 6755 ### 6756 # Prioritization 6757 ### 6758 6759 def get_config_default(self, name: str) -> dict: 6760 """ 6761 The function `get_config_default` returns a dictionary containing default configurations for 6762 various calculations and prioritizations. 6763 6764 :param name: The `get_config_default` function returns a dictionary containing default 6765 configurations for different calculations and prioritizations. The `name` parameter is used to 6766 specify which specific configuration to retrieve from the dictionary 6767 :type name: str 6768 :return: The function `get_config_default` returns a dictionary containing default configuration 6769 settings for different calculations and prioritizations. The specific configuration settings are 6770 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6771 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6772 returned. If there is no match, an empty dictionary is returned. 
6773 """ 6774 6775 config_default = { 6776 "calculations": { 6777 "variant_chr_pos_alt_ref": { 6778 "type": "sql", 6779 "name": "variant_chr_pos_alt_ref", 6780 "description": "Create a variant ID with chromosome, position, alt and ref", 6781 "available": False, 6782 "output_column_name": "variant_chr_pos_alt_ref", 6783 "output_column_type": "String", 6784 "output_column_description": "variant ID with chromosome, position, alt and ref", 6785 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6786 "operation_info": True, 6787 }, 6788 "VARTYPE": { 6789 "type": "sql", 6790 "name": "VARTYPE", 6791 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6792 "available": True, 6793 "table": "variants", 6794 "output_column_name": "VARTYPE", 6795 "output_column_type": "String", 6796 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6797 "operation_query": """ 6798 CASE 6799 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6800 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6801 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6802 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6803 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6804 ELSE 'UNDEFINED' 6805 END 6806 """, 6807 "info_fields": ["SVTYPE"], 6808 "operation_info": True, 6809 }, 6810 "snpeff_hgvs": { 6811 "type": "python", 6812 "name": "snpeff_hgvs", 6813 "description": "HGVS nomenclatures from snpEff annotation", 6814 "available": True, 6815 "function_name": "calculation_extract_snpeff_hgvs", 6816 "function_params": ["snpeff_hgvs", "ANN"], 6817 }, 6818 "snpeff_ann_explode": { 6819 "type": "python", 6820 "name": "snpeff_ann_explode", 6821 "description": "Explode snpEff annotations with uniquify values", 6822 "available": True, 6823 "function_name": "calculation_snpeff_ann_explode", 6824 "function_params": [False, "fields", "snpeff_", "ANN"], 6825 }, 6826 "snpeff_ann_explode_uniquify": { 6827 "type": "python", 6828 
"name": "snpeff_ann_explode_uniquify", 6829 "description": "Explode snpEff annotations", 6830 "available": True, 6831 "function_name": "calculation_snpeff_ann_explode", 6832 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6833 }, 6834 "snpeff_ann_explode_json": { 6835 "type": "python", 6836 "name": "snpeff_ann_explode_json", 6837 "description": "Explode snpEff annotations in JSON format", 6838 "available": True, 6839 "function_name": "calculation_snpeff_ann_explode", 6840 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6841 }, 6842 "NOMEN": { 6843 "type": "python", 6844 "name": "NOMEN", 6845 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6846 "available": True, 6847 "function_name": "calculation_extract_nomen", 6848 "function_params": [], 6849 }, 6850 "RENAME_INFO_FIELDS": { 6851 "type": "python", 6852 "name": "RENAME_INFO_FIELDS", 6853 "description": "Rename or remove INFO/tags", 6854 "available": True, 6855 "function_name": "calculation_rename_info_fields", 6856 "function_params": [], 6857 }, 6858 "FINDBYPIPELINE": { 6859 "type": "python", 6860 "name": "FINDBYPIPELINE", 6861 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6862 "available": True, 6863 "function_name": "calculation_find_by_pipeline", 6864 "function_params": ["findbypipeline"], 6865 }, 6866 "FINDBYSAMPLE": { 6867 "type": "python", 6868 "name": "FINDBYSAMPLE", 6869 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6870 "available": True, 6871 "function_name": "calculation_find_by_pipeline", 6872 "function_params": ["findbysample"], 6873 }, 6874 "GENOTYPECONCORDANCE": { 6875 "type": "python", 6876 "name": "GENOTYPECONCORDANCE", 6877 "description": "Concordance of genotype for multi caller VCF", 6878 "available": True, 6879 "function_name": "calculation_genotype_concordance", 6880 "function_params": [], 6881 }, 6882 
"BARCODE": { 6883 "type": "python", 6884 "name": "BARCODE", 6885 "description": "BARCODE as VaRank tool", 6886 "available": True, 6887 "function_name": "calculation_barcode", 6888 "function_params": [], 6889 }, 6890 "BARCODEFAMILY": { 6891 "type": "python", 6892 "name": "BARCODEFAMILY", 6893 "description": "BARCODEFAMILY as VaRank tool", 6894 "available": True, 6895 "function_name": "calculation_barcode_family", 6896 "function_params": ["BCF"], 6897 }, 6898 "TRIO": { 6899 "type": "python", 6900 "name": "TRIO", 6901 "description": "Inheritance for a trio family", 6902 "available": True, 6903 "function_name": "calculation_trio", 6904 "function_params": [], 6905 }, 6906 "VAF": { 6907 "type": "python", 6908 "name": "VAF", 6909 "description": "Variant Allele Frequency (VAF) harmonization", 6910 "available": True, 6911 "function_name": "calculation_vaf_normalization", 6912 "function_params": [], 6913 }, 6914 "VAF_stats": { 6915 "type": "python", 6916 "name": "VAF_stats", 6917 "description": "Variant Allele Frequency (VAF) statistics", 6918 "available": True, 6919 "function_name": "calculation_genotype_stats", 6920 "function_params": ["VAF"], 6921 }, 6922 "DP_stats": { 6923 "type": "python", 6924 "name": "DP_stats", 6925 "description": "Depth (DP) statistics", 6926 "available": True, 6927 "function_name": "calculation_genotype_stats", 6928 "function_params": ["DP"], 6929 }, 6930 "variant_id": { 6931 "type": "python", 6932 "name": "variant_id", 6933 "description": "Variant ID generated from variant position and type", 6934 "available": True, 6935 "function_name": "calculation_variant_id", 6936 "function_params": [], 6937 }, 6938 "transcripts_json": { 6939 "type": "python", 6940 "name": "transcripts_json", 6941 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6942 "available": True, 6943 "function_name": "calculation_transcripts_annotation", 6944 "function_params": ["transcripts_json", None], 6945 }, 6946 "transcripts_ann": { 6947 
"type": "python", 6948 "name": "transcripts_ann", 6949 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6950 "available": True, 6951 "function_name": "calculation_transcripts_annotation", 6952 "function_params": [None, "transcripts_ann"], 6953 }, 6954 "transcripts_annotations": { 6955 "type": "python", 6956 "name": "transcripts_annotations", 6957 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6958 "available": True, 6959 "function_name": "calculation_transcripts_annotation", 6960 "function_params": [None, None], 6961 }, 6962 "transcripts_prioritization": { 6963 "type": "python", 6964 "name": "transcripts_prioritization", 6965 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6966 "available": True, 6967 "function_name": "calculation_transcripts_prioritization", 6968 "function_params": [], 6969 }, 6970 "transcripts_export": { 6971 "type": "python", 6972 "name": "transcripts_export", 6973 "description": "Export transcripts table/view as a file (using param.json)", 6974 "available": True, 6975 "function_name": "calculation_transcripts_export", 6976 "function_params": [], 6977 }, 6978 }, 6979 "prioritizations": { 6980 "default": { 6981 "ANN2": [ 6982 { 6983 "type": "contains", 6984 "value": "HIGH", 6985 "score": 5, 6986 "flag": "PASS", 6987 "comment": [ 6988 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6989 ], 6990 }, 6991 { 6992 "type": "contains", 6993 "value": "MODERATE", 6994 "score": 3, 6995 "flag": "PASS", 6996 "comment": [ 6997 "A non-disruptive variant that might change protein effectiveness" 6998 ], 6999 }, 7000 { 7001 "type": "contains", 7002 "value": "LOW", 7003 "score": 0, 7004 "flag": "FILTERED", 7005 "comment": [ 7006 "Assumed to be mostly harmless or unlikely to change protein behavior" 7007 ], 7008 
}, 7009 { 7010 "type": "contains", 7011 "value": "MODIFIER", 7012 "score": 0, 7013 "flag": "FILTERED", 7014 "comment": [ 7015 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7016 ], 7017 }, 7018 ], 7019 } 7020 }, 7021 } 7022 7023 return config_default.get(name, None) 7024 7025 def get_config_json( 7026 self, name: str, config_dict: dict = {}, config_file: str = None 7027 ) -> dict: 7028 """ 7029 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7030 default values, a dictionary, and a file. 7031 7032 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7033 the name of the configuration. It is used to identify and retrieve the configuration settings 7034 for a specific component or module 7035 :type name: str 7036 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7037 dictionary that allows you to provide additional configuration settings or overrides. When you 7038 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7039 the key is the configuration setting you want to override or 7040 :type config_dict: dict 7041 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7042 specify the path to a configuration file that contains additional settings. If provided, the 7043 function will read the contents of this file and update the configuration dictionary with the 7044 values found in the file, overriding any existing values with the 7045 :type config_file: str 7046 :return: The function `get_config_json` returns a dictionary containing the configuration 7047 settings. 
7048 """ 7049 7050 # Create with default prioritizations 7051 config_default = self.get_config_default(name=name) 7052 configuration = config_default 7053 # log.debug(f"configuration={configuration}") 7054 7055 # Replace prioritizations from dict 7056 for config in config_dict: 7057 configuration[config] = config_dict[config] 7058 7059 # Replace prioritizations from file 7060 config_file = full_path(config_file) 7061 if config_file: 7062 if os.path.exists(config_file): 7063 with open(config_file) as config_file_content: 7064 config_file_dict = yaml.safe_load(config_file_content) 7065 for config in config_file_dict: 7066 configuration[config] = config_file_dict[config] 7067 else: 7068 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7069 log.error(msg_error) 7070 raise ValueError(msg_error) 7071 7072 return configuration 7073 7074 def prioritization( 7075 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7076 ) -> bool: 7077 """ 7078 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7079 prioritizes variants based on configured profiles and criteria. 7080 7081 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7082 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7083 a table name is provided, the method will prioritize the variants in that specific table 7084 :type table: str 7085 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7086 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7087 provided, the code will use a default prefix value of "PZ" 7088 :type pz_prefix: str 7089 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7090 additional parameters specific to the prioritization process. 
        additional parameters for the prioritization process. These parameters can include
        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
        configurations needed for the prioritization of variants in a VCF
        :type pz_param: dict
        :return: A boolean value (True) is being returned from the `prioritization` function.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param: an explicit pz_param dict takes precedence over
        # the "prioritization" section of the object's parameters
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (defaults merged with an optional external config file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix for generated INFO fields (default "PZ")
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options; comma-separated strings are accepted for lists
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: top-level "prioritizations" param adds extra profiles
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Fail fast on any requested profile missing from the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # First requested profile becomes the default when none is set explicitly
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Added columns (working columns dropped again at the end of the method)
        added_columns = []

        # Create list of PZfields
        # List of PZFields: plain names plus one "<field><sep><profile>" per profile
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists (keep only fields absent from the header)
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            # NOTE(review): assigned but not referenced in this section — confirm needed
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (VCF header metadata for each generated field)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (un-suffixed fields, tied to the default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile (profile-suffixed variants)
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header: add one working column per PZfield, typed by field kind
            # (Score: INTEGER 0, Flag: BOOLEAN true, Class: VARCHAR[] null, others: STRING '')
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        # SQL snippets that serialize each working column into the INFO field
                        sql_set_info = []

                        # PZ fields set

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Score" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag (BOOLEAN column rendered as PASS/FILTERED)
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Flag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZClass (list column joined with ',' or '.' when empty)
                        if (
                            f"{pz_prefix}Class{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )

                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Class" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )
                                """
                            )

                        # PZComment (emitted only when non-empty)
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Comment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos (emitted only when non-empty)
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Infos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields: join snippets with ';' separators for the INFO field
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        criterion_fields_profile = []
                        # Randomized view name to avoid collisions between concurrent runs
                        annotation_view_name = (
                            "annotation_view_for_prioritization_"
                            + str(random.randrange(1000))
                        )
                        annotation_view_prefix = ""
                        for annotation in prioritizations_config[profile]:

                            # skip special sections
                            if annotation.startswith("_"):
                                continue

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:

                                # Criterion mode: "operation" (type/value keys) or "sql" (sql/fields keys)
                                criterion_mode = None
                                if np.any(
                                    np.isin(list(criterion.keys()), ["type", "value"])
                                ):
                                    criterion_mode = "operation"
                                elif np.any(
                                    np.isin(list(criterion.keys()), ["sql", "fields"])
                                ):
                                    criterion_mode = "sql"
                                log.debug(f"Criterion Mode: {criterion_mode}")

                                # Criterion parameters
                                criterion_type = criterion.get("type", None)
                                criterion_value = criterion.get("value", None)
                                criterion_sql = criterion.get("sql", None)
                                criterion_fields = criterion.get("fields", None)
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_class = criterion.get("class", None)
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Escape quotes/separators so comments embed safely in SQL and INFO
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                # SQL: a criterion may provide its SQL as a list of fragments
                                if criterion_sql is not None and isinstance(
                                    criterion_sql, list
                                ):
                                    criterion_sql = " ".join(criterion_sql)

                                # Fields and explode: default to the annotation name itself
                                if criterion_fields is None:
                                    criterion_fields = [annotation]
                                if not isinstance(criterion_fields, list):
                                    criterion_fields = str(criterion_fields).split(",")

                                # Class
                                if criterion_class is not None and not isinstance(
                                    criterion_class, list
                                ):
                                    criterion_class = str(criterion_class).split(",")

                                # Add criterion fields to the list of profile's criteria
                                criterion_fields_profile = list(
                                    set(criterion_fields_profile + criterion_fields)
                                )

                                sql_set = []
                                # NOTE(review): this reset shadows the profile-level
                                # sql_set_info built above and appears unused afterwards
                                sql_set_info = []

                                # PZ fields set

                                # PZScore
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # VaRank prioritization score mode (keep the max score)
                                    if prioritization_score_mode.upper().strip() in [
                                        "VARANK",
                                        "MAX",
                                        "MAXIMUM",
                                        "TOP",
                                    ]:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
                                        )
                                    # default HOWARD prioritization score mode (sum scores)
                                    else:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )

                                # PZFlag: AND-accumulated, so any FILTERED criterion filters the variant
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )

                                # PZClass: accumulate distinct class labels
                                if (
                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                    and criterion_class is not None
                                ):
                                    sql_set.append(
                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
                                    )

                                # PZComment: append with ', ' separator when already non-empty
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Comment{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Comment{pzfields_sep}{profile},
                                                CASE
                                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                    THEN ', '
                                                    ELSE ''
                                                END,
                                                '{criterion_comment}'
                                            )
                                        """
                                    )

                                # PZInfos: append raw criterion repr for traceability
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Infos{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Infos{pzfields_sep}{profile},
                                                '{criterion_infos}'
                                            )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison
                                if sql_set_option:

                                    # Operation mode
                                    if criterion_mode in ["operation"]:

                                        # Check if value is a float
                                        # NOTE(review): bare except — any failure (not only
                                        # ValueError from float()) falls through to the
                                        # string comparison branch
                                        try:
                                            float(criterion_value)
                                            sql_update = f"""
                                                UPDATE "{table_variants}"
                                                SET {sql_set_option}
                                                FROM (
                                                    SELECT *
                                                    FROM "{annotation_view_name}"
                                                    WHERE (
                                                        CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                                        AND CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                                    )
                                                ) AS "{annotation_view_name}"
                                                WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
                                                  AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
                                                  AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
                                                  AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT"

                                            """
                                        # If not a float
                                        except:
                                            contains_option = ""
                                            if criterion_type == "contains":
                                                contains_option = ".*"
                                            sql_update = f"""
                                                UPDATE "{table_variants}"
                                                SET {sql_set_option}
                                                FROM (
                                                    SELECT *
                                                    FROM "{annotation_view_name}"
                                                    WHERE (
                                                        "{annotation_view_name}"."{annotation_view_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                                    )
                                                ) AS "{annotation_view_name}"
                                                WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
                                                  AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
                                                  AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
                                                  AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT"

                                            """
                                        sql_queries.append(sql_update)

                                    # SQL mode: criterion provides its own WHERE clause
                                    elif criterion_mode in ["sql"]:

                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            FROM (
                                                SELECT *
                                                FROM "{annotation_view_name}"
                                                WHERE ({criterion_sql})
                                            ) AS "{annotation_view_name}"
                                            WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
                                              AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
                                              AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
                                              AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT"
                                        """
                                        sql_queries.append(sql_update)

                                    else:
                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
                                        log.error(msg_err)
                                        raise ValueError(msg_err)

                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                        # PZTags: summary field "<field>#<value>,..." built from the other PZfields
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Create PZTags value (SQL string fragment embedded in concat below)
                            pztags_value = ""
                            pztags_sep_default = ","
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        elif pzfield in [f"{pz_prefix}Class"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                    ELSE '.'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZTags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                )
                                WHERE 1=1
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZTags for default profile (un-suffixed field)
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                    WHERE 1=1
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        # Create annotations view for prioritization
                        log.debug(
                            f"""Profile '{profile}' - Prioritization - Create '{annotation_view_name}' view with '{criterion_fields_profile}'... """
                        )
                        annotation_view = self.create_annotations_view(
                            view=annotation_view_name,
                            prefix=annotation_view_prefix,
                            fields=criterion_fields_profile,
                            drop_view=True,
                        )

                        # Chromosomes list (queries are run per chromosome to bound work)
                        sql_uniq_chrom = f"""
                            SELECT DISTINCT "#CHROM"
                            FROM {table_variants}
                        """
                        chroms = self.get_query_to_df(sql_uniq_chrom)["#CHROM"].tolist()

                        for chrom in chroms:

                            log.debug(
                                f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}'... """
                            )

                            if sql_queries:

                                # Query num
                                num_query = 0

                                # For each query
                                for sql_query in sql_queries:

                                    # Query num
                                    num_query += 1

                                    # Restrict each accumulated UPDATE to the current chromosome
                                    sql_query_chrom = f"""
                                        {sql_query}
                                        AND {table_variants}."#CHROM" LIKE '{chrom}'
                                    """
                                    log.debug(
                                        f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}' [{num_query}/{len(sql_queries)}]"""
                                    )
                                    # log.debug(f"""sql_query_chrom: {sql_query_chrom}""")
                                    self.execute_query(query=sql_query_chrom)

                        # Update INFO field (run once per profile, after all chromosomes)
                        log.info(f"""Profile '{profile}' - Update... """)
                        sql_query_update = f"""
                            UPDATE {table_variants}
                            SET INFO =
                                concat(
                                    CASE
                                        WHEN INFO NOT IN ('','.')
                                        THEN concat(INFO, ';')
                                        ELSE ''
                                    END
                                    {sql_set_info_option}
                                )
                        """
                        # log.debug(f"sql_query_update={sql_query_update}")
                        self.execute_query(query=sql_query_update)

                        # Remove annotations view for prioritization
                        query_drop_tmp_table = f"""
                            DROP VIEW IF EXISTS {annotation_view_name}
                        """
                        self.execute_query(query=query_drop_tmp_table)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (working columns are folded into INFO, then dropped)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True

    ###
    # HGVS
    ###

    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing.
If no value is provided, it will default to the number 7893 of threads obtained from the `get_threads()` method 7894 :type threads: int 7895 """ 7896 7897 # Function for each partition of the Dask Dataframe 7898 def partition_function(partition): 7899 """ 7900 The function `partition_function` applies the `annotation_hgvs_partition` function to 7901 each row of a DataFrame called `partition`. 7902 7903 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7904 to be processed 7905 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7906 the "partition" dataframe along the axis 1. 7907 """ 7908 return partition.apply(annotation_hgvs_partition, axis=1) 7909 7910 def annotation_hgvs_partition(row) -> str: 7911 """ 7912 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7913 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7914 7915 :param row: A dictionary-like object that contains the values for the following keys: 7916 :return: a string that contains the HGVS names associated with the given row of data. 
7917 """ 7918 7919 chr = row["CHROM"] 7920 pos = row["POS"] 7921 ref = row["REF"] 7922 alt = row["ALT"] 7923 7924 # Find list of associated transcripts 7925 transcripts_list = list( 7926 polars_conn.execute( 7927 f""" 7928 SELECT transcript 7929 FROM refseq_df 7930 WHERE CHROM='{chr}' 7931 AND POS={pos} 7932 """ 7933 )["transcript"] 7934 ) 7935 7936 # Full HGVS annotation in list 7937 hgvs_full_list = [] 7938 7939 for transcript_name in transcripts_list: 7940 7941 # Transcript 7942 transcript = get_transcript( 7943 transcripts=transcripts, transcript_name=transcript_name 7944 ) 7945 # Exon 7946 if use_exon: 7947 exon = transcript.find_exon_number(pos) 7948 else: 7949 exon = None 7950 # Protein 7951 transcript_protein = None 7952 if use_protein or add_protein or full_format: 7953 transcripts_protein = list( 7954 polars_conn.execute( 7955 f""" 7956 SELECT protein 7957 FROM refseqlink_df 7958 WHERE transcript='{transcript_name}' 7959 LIMIT 1 7960 """ 7961 )["protein"] 7962 ) 7963 if len(transcripts_protein): 7964 transcript_protein = transcripts_protein[0] 7965 7966 # HGVS name 7967 hgvs_name = format_hgvs_name( 7968 chr, 7969 pos, 7970 ref, 7971 alt, 7972 genome=genome, 7973 transcript=transcript, 7974 transcript_protein=transcript_protein, 7975 exon=exon, 7976 use_gene=use_gene, 7977 use_protein=use_protein, 7978 full_format=full_format, 7979 use_version=use_version, 7980 codon_type=codon_type, 7981 ) 7982 hgvs_full_list.append(hgvs_name) 7983 if add_protein and not use_protein and not full_format: 7984 hgvs_name = format_hgvs_name( 7985 chr, 7986 pos, 7987 ref, 7988 alt, 7989 genome=genome, 7990 transcript=transcript, 7991 transcript_protein=transcript_protein, 7992 exon=exon, 7993 use_gene=use_gene, 7994 use_protein=True, 7995 full_format=False, 7996 use_version=use_version, 7997 codon_type=codon_type, 7998 ) 7999 hgvs_full_list.append(hgvs_name) 8000 8001 # Create liste of HGVS annotations 8002 hgvs_full = ",".join(hgvs_full_list) 8003 8004 return hgvs_full 8005 
8006 # Polars connexion 8007 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8008 8009 # Config 8010 config = self.get_config() 8011 8012 # Databases 8013 # Genome 8014 databases_genomes_folders = ( 8015 config.get("folders", {}) 8016 .get("databases", {}) 8017 .get("genomes", DEFAULT_GENOME_FOLDER) 8018 ) 8019 databases_genome = ( 8020 config.get("folders", {}).get("databases", {}).get("genomes", "") 8021 ) 8022 # refseq database folder 8023 databases_refseq_folders = ( 8024 config.get("folders", {}) 8025 .get("databases", {}) 8026 .get("refseq", DEFAULT_REFSEQ_FOLDER) 8027 ) 8028 # refseq 8029 databases_refseq = config.get("databases", {}).get("refSeq", None) 8030 # refSeqLink 8031 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 8032 8033 # Param 8034 param = self.get_param() 8035 8036 # Quick HGVS 8037 if "hgvs_options" in param and param.get("hgvs_options", ""): 8038 log.info(f"Quick HGVS Annotation:") 8039 if not param.get("hgvs", None): 8040 param["hgvs"] = {} 8041 for option in param.get("hgvs_options", "").split(","): 8042 option_var_val = option.split("=") 8043 option_var = option_var_val[0] 8044 if len(option_var_val) > 1: 8045 option_val = option_var_val[1] 8046 else: 8047 option_val = "True" 8048 if option_val.upper() in ["TRUE"]: 8049 option_val = True 8050 elif option_val.upper() in ["FALSE"]: 8051 option_val = False 8052 log.info(f" {option_var}={option_val}") 8053 param["hgvs"][option_var] = option_val 8054 8055 # Check if HGVS annotation enabled 8056 if "hgvs" in param: 8057 log.info(f"HGVS Annotation... 
") 8058 for hgvs_option in param.get("hgvs", {}): 8059 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 8060 else: 8061 return 8062 8063 # HGVS Param 8064 param_hgvs = param.get("hgvs", {}) 8065 use_exon = param_hgvs.get("use_exon", False) 8066 use_gene = param_hgvs.get("use_gene", False) 8067 use_protein = param_hgvs.get("use_protein", False) 8068 add_protein = param_hgvs.get("add_protein", False) 8069 full_format = param_hgvs.get("full_format", False) 8070 use_version = param_hgvs.get("use_version", False) 8071 codon_type = param_hgvs.get("codon_type", "3") 8072 8073 # refSseq refSeqLink 8074 databases_refseq = param_hgvs.get("refseq", databases_refseq) 8075 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 8076 8077 # Assembly 8078 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 8079 8080 # Genome 8081 genome_file = None 8082 if find_genome(databases_genome): 8083 genome_file = find_genome(databases_genome) 8084 else: 8085 genome_file = find_genome( 8086 genome_path=databases_genomes_folders, assembly=assembly 8087 ) 8088 log.debug("Genome: " + str(genome_file)) 8089 8090 # refSseq 8091 refseq_file = find_file_prefix( 8092 input_file=databases_refseq, 8093 prefix="ncbiRefSeq", 8094 folder=databases_refseq_folders, 8095 assembly=assembly, 8096 ) 8097 log.debug("refSeq: " + str(refseq_file)) 8098 8099 # refSeqLink 8100 refseqlink_file = find_file_prefix( 8101 input_file=databases_refseqlink, 8102 prefix="ncbiRefSeqLink", 8103 folder=databases_refseq_folders, 8104 assembly=assembly, 8105 ) 8106 log.debug("refSeqLink: " + str(refseqlink_file)) 8107 8108 # Threads 8109 if not threads: 8110 threads = self.get_threads() 8111 log.debug("Threads: " + str(threads)) 8112 8113 # Variables 8114 table_variants = self.get_table_variants(clause="update") 8115 8116 # Get variants SNV and InDel only 8117 query_variants = f""" 8118 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8119 FROM {table_variants} 8120 WHERE REF 
~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8121 """ 8122 df_variants = self.get_query_to_df(query_variants) 8123 8124 # Added columns 8125 added_columns = [] 8126 8127 # Add hgvs column in variants table 8128 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8129 added_column = self.add_column( 8130 table_variants, hgvs_column_name, "STRING", default_value=None 8131 ) 8132 added_columns.append(added_column) 8133 8134 log.debug(f"refSeq loading...") 8135 # refSeq in duckDB 8136 refseq_table = get_refseq_table( 8137 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8138 ) 8139 # Loading all refSeq in Dataframe 8140 refseq_query = f""" 8141 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8142 FROM {refseq_table} 8143 JOIN df_variants ON ( 8144 {refseq_table}.chrom = df_variants.CHROM 8145 AND {refseq_table}.txStart<=df_variants.POS 8146 AND {refseq_table}.txEnd>=df_variants.POS 8147 ) 8148 """ 8149 refseq_df = self.conn.query(refseq_query).pl() 8150 8151 if refseqlink_file: 8152 log.debug(f"refSeqLink loading...") 8153 # refSeqLink in duckDB 8154 refseqlink_table = get_refseq_table( 8155 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8156 ) 8157 # Loading all refSeqLink in Dataframe 8158 protacc_column = "protAcc_with_ver" 8159 mrnaacc_column = "mrnaAcc_with_ver" 8160 refseqlink_query = f""" 8161 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8162 FROM {refseqlink_table} 8163 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8164 WHERE protAcc_without_ver IS NOT NULL 8165 """ 8166 # Polars Dataframe 8167 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8168 8169 # Read RefSeq transcripts into a python dict/model. 
8170 log.debug(f"Transcripts loading...") 8171 with tempfile.TemporaryDirectory() as tmpdir: 8172 transcripts_query = f""" 8173 COPY ( 8174 SELECT {refseq_table}.* 8175 FROM {refseq_table} 8176 JOIN df_variants ON ( 8177 {refseq_table}.chrom=df_variants.CHROM 8178 AND {refseq_table}.txStart<=df_variants.POS 8179 AND {refseq_table}.txEnd>=df_variants.POS 8180 ) 8181 ) 8182 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8183 """ 8184 self.conn.query(transcripts_query) 8185 with open(f"{tmpdir}/transcript.tsv") as infile: 8186 transcripts = read_transcripts(infile) 8187 8188 # Polars connexion 8189 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8190 8191 log.debug("Genome loading...") 8192 # Read genome sequence using pyfaidx. 8193 genome = Fasta(genome_file) 8194 8195 log.debug("Start annotation HGVS...") 8196 8197 # Create 8198 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8199 ddf = dd.from_pandas(df_variants, npartitions=threads) 8200 8201 # Use dask.dataframe.apply() to apply function on each partition 8202 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8203 8204 # Convert Dask DataFrame to Pandas Dataframe 8205 df = ddf.compute() 8206 8207 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
8208 with tempfile.TemporaryDirectory() as tmpdir: 8209 df_parquet = os.path.join(tmpdir, "df.parquet") 8210 df.to_parquet(df_parquet) 8211 8212 # Update hgvs column 8213 update_variant_query = f""" 8214 UPDATE {table_variants} 8215 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 8216 FROM read_parquet('{df_parquet}') as df 8217 WHERE variants."#CHROM" = df.CHROM 8218 AND variants.POS = df.POS 8219 AND variants.REF = df.REF 8220 AND variants.ALT = df.ALT 8221 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 8222 """ 8223 self.execute_query(update_variant_query) 8224 8225 # Update INFO column 8226 sql_query_update = f""" 8227 UPDATE {table_variants} 8228 SET INFO = 8229 concat( 8230 CASE 8231 WHEN INFO NOT IN ('','.') 8232 THEN concat(INFO, ';') 8233 ELSE '' 8234 END, 8235 'hgvs=', 8236 {hgvs_column_name} 8237 ) 8238 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 8239 """ 8240 self.execute_query(sql_query_update) 8241 8242 # Add header 8243 HGVS_INFOS = { 8244 "hgvs": { 8245 "ID": "hgvs", 8246 "Number": ".", 8247 "Type": "String", 8248 "Description": f"HGVS annotatation with HOWARD", 8249 } 8250 } 8251 8252 for field in HGVS_INFOS: 8253 field_ID = HGVS_INFOS[field]["ID"] 8254 field_description = HGVS_INFOS[field]["Description"] 8255 self.get_header().infos[field_ID] = vcf.parser._Info( 8256 field_ID, 8257 HGVS_INFOS[field]["Number"], 8258 HGVS_INFOS[field]["Type"], 8259 field_description, 8260 "unknown", 8261 "unknown", 8262 code_type_map[HGVS_INFOS[field]["Type"]], 8263 ) 8264 8265 # Remove added columns 8266 for added_column in added_columns: 8267 self.drop_column(column=added_column) 8268 8269 ### 8270 # Calculation 8271 ### 8272 8273 def get_operations_help( 8274 self, operations_config_dict: dict = {}, operations_config_file: str = None 8275 ) -> list: 8276 8277 # Init 8278 operations_help = [] 8279 8280 # operations 8281 operations = self.get_config_json( 8282 name="calculations", 8283 
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run the configured calculation operations on the variants table.

        Each requested operation is looked up (case-insensitively) in the calculations
        configuration and dispatched to `calculation_process_function` (type "python")
        or `calculation_process_sql` (type "sql"). Operations can come from three
        places, in precedence order: the "calculations" quick-option string in param,
        the param section "calculation.calculations", or the `operations` argument.

        :param operations: dict of operations to run (name -> options); overridden by
            param "calculation.calculations" when present
        :type operations: dict
        :param operations_config_dict: configuration dict describing known operations
        :type operations_config_dict: dict
        :param operations_config_file: path to a configuration file describing known
            operations; when None, taken from param "calculation.calculation_config"
        :type operations_config_file: str (optional)
        :raises ValueError: when an operation name or its type is not available in the
            operations configuration

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle" : null
                }
            }
        """

        # Param
        # NOTE(review): the mutable default arguments above are shared across calls;
        # they are only read (never mutated in place) so this is currently safe
        param = self.get_param()

        # CHeck operations config file (argument takes precedence over param)
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation lookup is case-insensitive)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add (comma-separated "calculations" quick option)
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param (quick-option ones keep priority order)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (fallback to param section)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # For each operations: dispatch on the configured type ("python" or "sql")
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
'{operation_name}' NOT available" 8417 ) 8418 raise ValueError( 8419 f"Operations config: Calculation '{operation_name}' NOT available" 8420 ) 8421 8422 # Explode INFOS fields into table fields 8423 if self.get_explode_infos(): 8424 self.explode_infos( 8425 prefix=self.get_explode_infos_prefix(), 8426 fields=self.get_explode_infos_fields(), 8427 force=True, 8428 ) 8429 8430 def calculation_process_sql( 8431 self, operation: dict, operation_name: str = "unknown" 8432 ) -> None: 8433 """ 8434 The `calculation_process_sql` function takes in a mathematical operation as a string and 8435 performs the operation, updating the specified table with the result. 8436 8437 :param operation: The `operation` parameter is a dictionary that contains information about the 8438 mathematical operation to be performed. It includes the following keys: 8439 :type operation: dict 8440 :param operation_name: The `operation_name` parameter is a string that represents the name of 8441 the mathematical operation being performed. 
It is used for logging and error handling purposes, 8442 defaults to unknown 8443 :type operation_name: str (optional) 8444 """ 8445 8446 # Operation infos 8447 operation_name = operation.get("name", "unknown") 8448 log.debug(f"process SQL {operation_name}") 8449 output_column_name = operation.get("output_column_name", operation_name) 8450 output_column_type = operation.get("output_column_type", "String") 8451 prefix = operation.get("explode_infos_prefix", "") 8452 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8453 output_column_description = operation.get( 8454 "output_column_description", f"{operation_name} operation" 8455 ) 8456 operation_query = operation.get("operation_query", None) 8457 if isinstance(operation_query, list): 8458 operation_query = " ".join(operation_query) 8459 operation_info_fields = operation.get("info_fields", []) 8460 operation_info_fields_check = operation.get("info_fields_check", False) 8461 operation_info = operation.get("operation_info", True) 8462 operation_table = operation.get( 8463 "table", self.get_table_variants(clause="alter") 8464 ) 8465 8466 # table variants 8467 if operation_table: 8468 table_variants = operation_table 8469 else: 8470 table_variants = self.get_table_variants(clause="alter") 8471 8472 if operation_query: 8473 8474 # Info fields check 8475 operation_info_fields_check_result = True 8476 if operation_info_fields_check: 8477 header_infos = self.get_header().infos 8478 for info_field in operation_info_fields: 8479 operation_info_fields_check_result = ( 8480 operation_info_fields_check_result 8481 and info_field in header_infos 8482 ) 8483 8484 # If info fields available 8485 if operation_info_fields_check_result: 8486 8487 # Added_columns 8488 added_columns = [] 8489 8490 # Create VCF header field 8491 vcf_reader = self.get_header() 8492 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8493 output_column_name, 8494 ".", 8495 output_column_type, 8496 
output_column_description, 8497 "howard calculation", 8498 "0", 8499 self.code_type_map.get(output_column_type), 8500 ) 8501 8502 # Explode infos if needed 8503 log.debug(f"calculation_process_sql prefix {prefix}") 8504 added_columns += self.explode_infos( 8505 prefix=prefix, 8506 fields=[output_column_name] + operation_info_fields, 8507 force=False, 8508 table=table_variants, 8509 ) 8510 8511 # Create column 8512 added_column = self.add_column( 8513 table_name=table_variants, 8514 column_name=prefix + output_column_name, 8515 column_type=output_column_type_sql, 8516 default_value="null", 8517 ) 8518 added_columns.append(added_column) 8519 8520 # Operation calculation 8521 try: 8522 8523 # Query to update calculation column 8524 sql_update = f""" 8525 UPDATE {table_variants} 8526 SET "{prefix}{output_column_name}" = ({operation_query}) 8527 """ 8528 self.conn.execute(sql_update) 8529 8530 # Add to INFO 8531 if operation_info: 8532 sql_update_info = f""" 8533 UPDATE {table_variants} 8534 SET "INFO" = 8535 concat( 8536 CASE 8537 WHEN "INFO" IS NOT NULL 8538 THEN concat("INFO", ';') 8539 ELSE '' 8540 END, 8541 '{output_column_name}=', 8542 "{prefix}{output_column_name}" 8543 ) 8544 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8545 """ 8546 self.conn.execute(sql_update_info) 8547 8548 except: 8549 log.error( 8550 f"Operations config: Calculation '{operation_name}' query failed" 8551 ) 8552 raise ValueError( 8553 f"Operations config: Calculation '{operation_name}' query failed" 8554 ) 8555 8556 # Remove added columns 8557 for added_column in added_columns: 8558 log.debug(f"added_column: {added_column}") 8559 self.drop_column(column=added_column) 8560 8561 else: 8562 log.error( 8563 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8564 ) 8565 raise ValueError( 8566 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8567 ) 8568 8569 else: 8570 log.error( 8571 f"Operations config: Calculation '{operation_name}' query NOT defined" 8572 ) 8573 raise ValueError( 8574 f"Operations config: Calculation '{operation_name}' query NOT defined" 8575 ) 8576 8577 def calculation_process_function( 8578 self, operation: dict, operation_name: str = "unknown" 8579 ) -> None: 8580 """ 8581 The `calculation_process_function` takes in an operation dictionary and performs the specified 8582 function with the given parameters. 8583 8584 :param operation: The `operation` parameter is a dictionary that contains information about the 8585 operation to be performed. It has the following keys: 8586 :type operation: dict 8587 :param operation_name: The `operation_name` parameter is a string that represents the name of 8588 the operation being performed. It is used for logging purposes, defaults to unknown 8589 :type operation_name: str (optional) 8590 """ 8591 8592 operation_name = operation["name"] 8593 log.debug(f"process Python {operation_name}") 8594 function_name = operation["function_name"] 8595 function_params = operation["function_params"] 8596 getattr(self, function_name)(*function_params) 8597 8598 def calculation_variant_id(self) -> None: 8599 """ 8600 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8601 updates the INFO field of a variants table with the variant ID. 
8602 """ 8603 8604 # variant_id annotation field 8605 variant_id_tag = self.get_variant_id_column() 8606 added_columns = [variant_id_tag] 8607 8608 # variant_id hgvs tags" 8609 vcf_infos_tags = { 8610 variant_id_tag: "howard variant ID annotation", 8611 } 8612 8613 # Variants table 8614 table_variants = self.get_table_variants() 8615 8616 # Header 8617 vcf_reader = self.get_header() 8618 8619 # Add variant_id to header 8620 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8621 variant_id_tag, 8622 ".", 8623 "String", 8624 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8625 "howard calculation", 8626 "0", 8627 self.code_type_map.get("String"), 8628 ) 8629 8630 # Update 8631 sql_update = f""" 8632 UPDATE {table_variants} 8633 SET "INFO" = 8634 concat( 8635 CASE 8636 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8637 THEN '' 8638 ELSE concat("INFO", ';') 8639 END, 8640 '{variant_id_tag}=', 8641 "{variant_id_tag}" 8642 ) 8643 """ 8644 self.conn.execute(sql_update) 8645 8646 # Remove added columns 8647 for added_column in added_columns: 8648 self.drop_column(column=added_column) 8649 8650 def calculation_extract_snpeff_hgvs( 8651 self, 8652 snpeff_hgvs: str = "snpeff_hgvs", 8653 snpeff_field: str = "ANN", 8654 ) -> None: 8655 """ 8656 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8657 annotation field in a VCF file and adds them as a new column in the variants table. 8658 8659 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8660 function is used to specify the name of the column that will store the HGVS nomenclatures 8661 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8662 snpeff_hgvs 8663 :type snpeff_hgvs: str (optional) 8664 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8665 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 8666 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8667 to ANN 8668 :type snpeff_field: str (optional) 8669 """ 8670 8671 # Snpeff hgvs tags 8672 vcf_infos_tags = { 8673 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8674 } 8675 8676 # Prefix 8677 prefix = self.get_explode_infos_prefix() 8678 if prefix: 8679 prefix = "INFO/" 8680 8681 # snpEff fields 8682 speff_ann_infos = prefix + snpeff_field 8683 speff_hgvs_infos = prefix + snpeff_hgvs 8684 8685 # Variants table 8686 table_variants = self.get_table_variants() 8687 8688 # Header 8689 vcf_reader = self.get_header() 8690 8691 # Add columns 8692 added_columns = [] 8693 8694 # Explode HGVS field in column 8695 added_columns += self.explode_infos(fields=[snpeff_field]) 8696 8697 if snpeff_field in vcf_reader.infos: 8698 8699 log.debug(vcf_reader.infos[snpeff_field]) 8700 8701 # Extract ANN header 8702 ann_description = vcf_reader.infos[snpeff_field].desc 8703 pattern = r"'(.+?)'" 8704 match = re.search(pattern, ann_description) 8705 if match: 8706 ann_header_match = match.group(1).split(" | ") 8707 ann_header_desc = {} 8708 for i in range(len(ann_header_match)): 8709 ann_header_info = "".join( 8710 char for char in ann_header_match[i] if char.isalnum() 8711 ) 8712 ann_header_desc[ann_header_info] = ann_header_match[i] 8713 if not ann_header_desc: 8714 raise ValueError("Invalid header description format") 8715 else: 8716 raise ValueError("Invalid header description format") 8717 8718 # Create variant id 8719 variant_id_column = self.get_variant_id_column() 8720 added_columns += [variant_id_column] 8721 8722 # Create dataframe 8723 dataframe_snpeff_hgvs = self.get_query_to_df( 8724 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8725 ) 8726 8727 # Create main NOMEN column 8728 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8729 speff_ann_infos 8730 ].apply( 8731 lambda x: extract_snpeff_hgvs( 
8732 str(x), header=list(ann_header_desc.values()) 8733 ) 8734 ) 8735 8736 # Add snpeff_hgvs to header 8737 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8738 snpeff_hgvs, 8739 ".", 8740 "String", 8741 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8742 "howard calculation", 8743 "0", 8744 self.code_type_map.get("String"), 8745 ) 8746 8747 # Update 8748 sql_update = f""" 8749 UPDATE variants 8750 SET "INFO" = 8751 concat( 8752 CASE 8753 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8754 THEN '' 8755 ELSE concat("INFO", ';') 8756 END, 8757 CASE 8758 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8759 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8760 THEN concat( 8761 '{snpeff_hgvs}=', 8762 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8763 ) 8764 ELSE '' 8765 END 8766 ) 8767 FROM dataframe_snpeff_hgvs 8768 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8769 8770 """ 8771 self.conn.execute(sql_update) 8772 8773 # Delete dataframe 8774 del dataframe_snpeff_hgvs 8775 gc.collect() 8776 8777 else: 8778 8779 log.warning( 8780 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 8781 ) 8782 8783 # Remove added columns 8784 for added_column in added_columns: 8785 self.drop_column(column=added_column) 8786 8787 def calculation_snpeff_ann_explode( 8788 self, 8789 uniquify: bool = True, 8790 output_format: str = "fields", 8791 output_prefix: str = "snpeff_", 8792 snpeff_field: str = "ANN", 8793 ) -> None: 8794 """ 8795 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8796 exploding the HGVS field and updating variant information accordingly. 8797 8798 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8799 boolean flag that determines whether the output should be uniquified or not. 
When set to `True`, 8800 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8801 defaults to True 8802 :type uniquify: bool (optional) 8803 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8804 function specifies the format in which the output annotations will be generated. It has a 8805 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8806 format, defaults to fields 8807 :type output_format: str (optional) 8808 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8809 method is used to specify the prefix that will be added to the output annotations generated 8810 during the calculation process. This prefix helps to differentiate the newly added annotations 8811 from existing ones in the output data. By default, the, defaults to ANN_ 8812 :type output_prefix: str (optional) 8813 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8814 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8815 field will be processed to explode the HGVS annotations and update the variant information 8816 accordingly, defaults to ANN 8817 :type snpeff_field: str (optional) 8818 """ 8819 8820 # SnpEff annotation field 8821 snpeff_hgvs = "snpeff_ann_explode" 8822 8823 # Snpeff hgvs tags 8824 vcf_infos_tags = { 8825 snpeff_hgvs: "Explode snpEff annotations", 8826 } 8827 8828 # Prefix 8829 prefix = self.get_explode_infos_prefix() 8830 if prefix: 8831 prefix = "INFO/" 8832 8833 # snpEff fields 8834 speff_ann_infos = prefix + snpeff_field 8835 speff_hgvs_infos = prefix + snpeff_hgvs 8836 8837 # Variants table 8838 table_variants = self.get_table_variants() 8839 8840 # Header 8841 vcf_reader = self.get_header() 8842 8843 # Add columns 8844 added_columns = [] 8845 8846 # Explode HGVS field in column 8847 added_columns += self.explode_infos(fields=[snpeff_field]) 8848 log.debug(f"snpeff_field={snpeff_field}") 8849 log.debug(f"added_columns={added_columns}") 8850 8851 if snpeff_field in vcf_reader.infos: 8852 8853 # Extract ANN header 8854 ann_description = vcf_reader.infos[snpeff_field].desc 8855 pattern = r"'(.+?)'" 8856 match = re.search(pattern, ann_description) 8857 if match: 8858 ann_header_match = match.group(1).split(" | ") 8859 ann_header = [] 8860 ann_header_desc = {} 8861 for i in range(len(ann_header_match)): 8862 ann_header_info = "".join( 8863 char for char in ann_header_match[i] if char.isalnum() 8864 ) 8865 ann_header.append(ann_header_info) 8866 ann_header_desc[ann_header_info] = ann_header_match[i] 8867 if not ann_header_desc: 8868 raise ValueError("Invalid header description format") 8869 else: 8870 raise ValueError("Invalid header description format") 8871 8872 # Create variant id 8873 variant_id_column = self.get_variant_id_column() 8874 added_columns += [variant_id_column] 8875 8876 # Create dataframe 8877 dataframe_snpeff_hgvs = self.get_query_to_df( 8878 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8879 ) 8880 
8881 # Create snpEff columns 8882 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8883 speff_ann_infos 8884 ].apply( 8885 lambda x: explode_snpeff_ann( 8886 str(x), 8887 uniquify=uniquify, 8888 output_format=output_format, 8889 prefix=output_prefix, 8890 header=list(ann_header_desc.values()), 8891 ) 8892 ) 8893 8894 # Header 8895 ann_annotations_prefix = "" 8896 if output_format.upper() in ["JSON"]: 8897 ann_annotations_prefix = f"{output_prefix}=" 8898 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8899 output_prefix, 8900 ".", 8901 "String", 8902 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8903 + " - JSON format", 8904 "howard calculation", 8905 "0", 8906 self.code_type_map.get("String"), 8907 ) 8908 else: 8909 for ann_annotation in ann_header: 8910 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8911 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8912 ann_annotation_id, 8913 ".", 8914 "String", 8915 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8916 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8917 "howard calculation", 8918 "0", 8919 self.code_type_map.get("String"), 8920 ) 8921 8922 # Update 8923 sql_update = f""" 8924 UPDATE variants 8925 SET "INFO" = 8926 concat( 8927 CASE 8928 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8929 THEN '' 8930 ELSE concat("INFO", ';') 8931 END, 8932 CASE 8933 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8934 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8935 THEN concat( 8936 '{ann_annotations_prefix}', 8937 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8938 ) 8939 ELSE '' 8940 END 8941 ) 8942 FROM dataframe_snpeff_hgvs 8943 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8944 8945 """ 8946 self.conn.execute(sql_update) 8947 8948 # Delete dataframe 8949 del dataframe_snpeff_hgvs 8950 gc.collect() 8951 8952 else: 8953 8954 log.warning( 8955 "No snpEff annotation. 
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS nomenclature components (NOMEN, CNOMEN, PNOMEN, ...) from the
        HGVS annotation field and append them to the INFO field of the variants table.

        The HGVS field and the transcript-preference options are read from param under
        "calculation.calculations.NOMEN.options": "hgvs_field" (default "hgvs"),
        "pattern", "transcripts" (file of preferred transcripts), "transcripts_table",
        "transcripts_column" and "transcripts_order" (default ["column", "file"]).

        :raises ValueError: when the configured transcripts file does not exist
        """

        # NOMEN field (struct column produced by find_nomen, exploded into INFO tags)
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: tag name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads
        # NOTE(review): 'threads' is retrieved but never used below — confirm intended
        threads = self.get_threads()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Added columns (temporary columns dropped at the end)
        added_columns = []

        # Get HGVS field
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources
        transcripts_sources = {}

        # Get transcripts (file of preferred transcripts, first column used)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(
                fields=[transcripts_column], table=transcripts_table
            )
        else:
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            # NOTE(review): queries the hard-coded 'variants' table — confirm against
            # get_table_variants() used elsewhere in this class
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Transcripts rank: preferred transcripts ranked by file order (1-based)
            transcripts_rank = {
                transcript: rank for rank, transcript in enumerate(transcripts, start=1)
            }
            transcripts_len = len(transcripts_rank)

            # Create main NOMEN column (struct of all NOMEN components per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len,
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Add field to SQL query update (skip NULL/empty components)
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO with all non-empty NOMEN components, joined on variant key
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
nomen_dict.get(nomen_field, "howard calculation NOMEN"), 9120 "howard calculation", 9121 "0", 9122 self.code_type_map.get("String"), 9123 ) 9124 9125 # Add field to SQL query update 9126 sql_nomen_fields.append( 9127 f""" 9128 CASE 9129 WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('') 9130 THEN concat( 9131 ';{nomen_field}=', 9132 dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" 9133 ) 9134 ELSE '' 9135 END 9136 """ 9137 ) 9138 9139 # SQL set for update 9140 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 9141 9142 # Update 9143 sql_update = f""" 9144 UPDATE variants 9145 SET "INFO" = 9146 concat( 9147 CASE 9148 WHEN "INFO" IS NULL 9149 THEN '' 9150 ELSE "INFO" 9151 END, 9152 {sql_nomen_fields_set} 9153 ) 9154 FROM dataframe_hgvs 9155 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 9156 AND variants."POS" = dataframe_hgvs."POS" 9157 AND variants."REF" = dataframe_hgvs."REF" 9158 AND variants."ALT" = dataframe_hgvs."ALT" 9159 """ 9160 self.conn.execute(sql_update) 9161 9162 # Delete dataframe 9163 del dataframe_hgvs 9164 gc.collect() 9165 9166 # Remove added columns 9167 for added_column in added_columns: 9168 self.drop_column(column=added_column) 9169 9170 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9171 """ 9172 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9173 pipeline/sample for a variant and updates the variant information in a VCF file. 9174 9175 :param tag: The `tag` parameter is a string that represents the annotation field for the 9176 "findbypipeline" information in the VCF file. 
It is used to create the annotation field in the 9177 VCF header and to update the corresponding field in the variants table, defaults to 9178 findbypipeline 9179 :type tag: str (optional) 9180 """ 9181 9182 # if FORMAT and samples 9183 if ( 9184 "FORMAT" in self.get_header_columns_as_list() 9185 and self.get_header_sample_list() 9186 ): 9187 9188 # findbypipeline annotation field 9189 findbypipeline_tag = tag 9190 9191 # VCF infos tags 9192 vcf_infos_tags = { 9193 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9194 } 9195 9196 # Prefix 9197 prefix = self.get_explode_infos_prefix() 9198 9199 # Field 9200 findbypipeline_infos = prefix + findbypipeline_tag 9201 9202 # Variants table 9203 table_variants = self.get_table_variants() 9204 9205 # Header 9206 vcf_reader = self.get_header() 9207 9208 # Create variant id 9209 variant_id_column = self.get_variant_id_column() 9210 added_columns = [variant_id_column] 9211 9212 # variant_id, FORMAT and samples 9213 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9214 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9215 ) 9216 9217 # Create dataframe 9218 dataframe_findbypipeline = self.get_query_to_df( 9219 f""" SELECT {samples_fields} FROM {table_variants} """ 9220 ) 9221 9222 # Create findbypipeline column 9223 dataframe_findbypipeline[findbypipeline_infos] = ( 9224 dataframe_findbypipeline.apply( 9225 lambda row: findbypipeline( 9226 row, samples=self.get_header_sample_list() 9227 ), 9228 axis=1, 9229 ) 9230 ) 9231 9232 # Add snpeff_hgvs to header 9233 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9234 findbypipeline_tag, 9235 ".", 9236 "String", 9237 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9238 "howard calculation", 9239 "0", 9240 self.code_type_map.get("String"), 9241 ) 9242 9243 # Update 9244 sql_update = f""" 9245 UPDATE variants 9246 SET "INFO" = 9247 concat( 9248 CASE 9249 WHEN "INFO" IS NULL OR "INFO" IN 
('','.') 9250 THEN '' 9251 ELSE concat("INFO", ';') 9252 END, 9253 CASE 9254 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9255 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9256 THEN concat( 9257 '{findbypipeline_tag}=', 9258 dataframe_findbypipeline."{findbypipeline_infos}" 9259 ) 9260 ELSE '' 9261 END 9262 ) 9263 FROM dataframe_findbypipeline 9264 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9265 """ 9266 self.conn.execute(sql_update) 9267 9268 # Remove added columns 9269 for added_column in added_columns: 9270 self.drop_column(column=added_column) 9271 9272 # Delete dataframe 9273 del dataframe_findbypipeline 9274 gc.collect() 9275 9276 def calculation_genotype_concordance(self) -> None: 9277 """ 9278 The function `calculation_genotype_concordance` calculates the genotype concordance for 9279 multi-caller VCF files and updates the variant information in the database. 9280 """ 9281 9282 # if FORMAT and samples 9283 if ( 9284 "FORMAT" in self.get_header_columns_as_list() 9285 and self.get_header_sample_list() 9286 ): 9287 9288 # genotypeconcordance annotation field 9289 genotypeconcordance_tag = "genotypeconcordance" 9290 9291 # VCF infos tags 9292 vcf_infos_tags = { 9293 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9294 } 9295 9296 # Prefix 9297 prefix = self.get_explode_infos_prefix() 9298 9299 # Field 9300 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9301 9302 # Variants table 9303 table_variants = self.get_table_variants() 9304 9305 # Header 9306 vcf_reader = self.get_header() 9307 9308 # Create variant id 9309 variant_id_column = self.get_variant_id_column() 9310 added_columns = [variant_id_column] 9311 9312 # variant_id, FORMAT and samples 9313 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9314 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9315 ) 9316 9317 # Create dataframe 9318 
dataframe_genotypeconcordance = self.get_query_to_df( 9319 f""" SELECT {samples_fields} FROM {table_variants} """ 9320 ) 9321 9322 # Create genotypeconcordance column 9323 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9324 dataframe_genotypeconcordance.apply( 9325 lambda row: genotypeconcordance( 9326 row, samples=self.get_header_sample_list() 9327 ), 9328 axis=1, 9329 ) 9330 ) 9331 9332 # Add genotypeconcordance to header 9333 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9334 genotypeconcordance_tag, 9335 ".", 9336 "String", 9337 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9338 "howard calculation", 9339 "0", 9340 self.code_type_map.get("String"), 9341 ) 9342 9343 # Update 9344 sql_update = f""" 9345 UPDATE variants 9346 SET "INFO" = 9347 concat( 9348 CASE 9349 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9350 THEN '' 9351 ELSE concat("INFO", ';') 9352 END, 9353 CASE 9354 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9355 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9356 THEN concat( 9357 '{genotypeconcordance_tag}=', 9358 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9359 ) 9360 ELSE '' 9361 END 9362 ) 9363 FROM dataframe_genotypeconcordance 9364 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9365 """ 9366 self.conn.execute(sql_update) 9367 9368 # Remove added columns 9369 for added_column in added_columns: 9370 self.drop_column(column=added_column) 9371 9372 # Delete dataframe 9373 del dataframe_genotypeconcordance 9374 gc.collect() 9375 9376 def calculation_barcode(self, tag: str = "barcode") -> None: 9377 """ 9378 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9379 updates the INFO field in the file with the calculated barcode values. 
9380 9381 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9382 name that will be used for the barcode calculation in the VCF file. If no tag name is provided, 9383 the default tag name is set to "barcode", defaults to barcode 9384 :type tag: str (optional) 9385 """ 9386 9387 # if FORMAT and samples 9388 if ( 9389 "FORMAT" in self.get_header_columns_as_list() 9390 and self.get_header_sample_list() 9391 ): 9392 9393 # barcode annotation field 9394 if not tag: 9395 tag = "barcode" 9396 9397 # VCF infos tags 9398 vcf_infos_tags = { 9399 tag: "barcode calculation (VaRank)", 9400 } 9401 9402 # Prefix 9403 prefix = self.get_explode_infos_prefix() 9404 9405 # Field 9406 barcode_infos = prefix + tag 9407 9408 # Variants table 9409 table_variants = self.get_table_variants() 9410 9411 # Header 9412 vcf_reader = self.get_header() 9413 9414 # Create variant id 9415 variant_id_column = self.get_variant_id_column() 9416 added_columns = [variant_id_column] 9417 9418 # variant_id, FORMAT and samples 9419 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9420 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9421 ) 9422 9423 # Create dataframe 9424 dataframe_barcode = self.get_query_to_df( 9425 f""" SELECT {samples_fields} FROM {table_variants} """ 9426 ) 9427 9428 # Create barcode column 9429 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9430 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9431 ) 9432 9433 # Add barcode to header 9434 vcf_reader.infos[tag] = vcf.parser._Info( 9435 tag, 9436 ".", 9437 "String", 9438 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9439 "howard calculation", 9440 "0", 9441 self.code_type_map.get("String"), 9442 ) 9443 9444 # Update 9445 sql_update = f""" 9446 UPDATE {table_variants} 9447 SET "INFO" = 9448 concat( 9449 CASE 9450 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9451 THEN '' 9452 ELSE concat("INFO", ';') 9453 END, 9454 CASE 
9455 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9456 AND dataframe_barcode."{barcode_infos}" NOT NULL 9457 THEN concat( 9458 '{tag}=', 9459 dataframe_barcode."{barcode_infos}" 9460 ) 9461 ELSE '' 9462 END 9463 ) 9464 FROM dataframe_barcode 9465 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9466 """ 9467 self.conn.execute(sql_update) 9468 9469 # Remove added columns 9470 for added_column in added_columns: 9471 self.drop_column(column=added_column) 9472 9473 # Delete dataframe 9474 del dataframe_barcode 9475 gc.collect() 9476 9477 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9478 """ 9479 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9480 and updates the INFO field in the file with the calculated barcode values. 9481 9482 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9483 the barcode tag that will be added to the VCF file during the calculation process. 
If no value 9484 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9485 :type tag: str (optional) 9486 """ 9487 9488 # if FORMAT and samples 9489 if ( 9490 "FORMAT" in self.get_header_columns_as_list() 9491 and self.get_header_sample_list() 9492 ): 9493 9494 # barcode annotation field 9495 if not tag: 9496 tag = "BCF" 9497 9498 # VCF infos tags 9499 vcf_infos_tags = { 9500 tag: "barcode family calculation", 9501 f"{tag}S": "barcode family samples", 9502 } 9503 9504 # Param 9505 param = self.get_param() 9506 log.debug(f"param={param}") 9507 9508 # Prefix 9509 prefix = self.get_explode_infos_prefix() 9510 9511 # PED param 9512 ped = ( 9513 param.get("calculation", {}) 9514 .get("calculations", {}) 9515 .get("BARCODEFAMILY", {}) 9516 .get("family_pedigree", None) 9517 ) 9518 log.debug(f"ped={ped}") 9519 9520 # Load PED 9521 if ped: 9522 9523 # Pedigree is a file 9524 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9525 log.debug("Pedigree is file") 9526 with open(full_path(ped)) as ped: 9527 ped = yaml.safe_load(ped) 9528 9529 # Pedigree is a string 9530 elif isinstance(ped, str): 9531 log.debug("Pedigree is str") 9532 try: 9533 ped = json.loads(ped) 9534 log.debug("Pedigree is json str") 9535 except ValueError as e: 9536 ped_samples = ped.split(",") 9537 ped = {} 9538 for ped_sample in ped_samples: 9539 ped[ped_sample] = ped_sample 9540 9541 # Pedigree is a dict 9542 elif isinstance(ped, dict): 9543 log.debug("Pedigree is dict") 9544 9545 # Pedigree is not well formatted 9546 else: 9547 msg_error = "Pedigree not well formatted" 9548 log.error(msg_error) 9549 raise ValueError(msg_error) 9550 9551 # Construct list 9552 ped_samples = list(ped.values()) 9553 9554 else: 9555 log.debug("Pedigree not defined. 
Take all samples") 9556 ped_samples = self.get_header_sample_list() 9557 ped = {} 9558 for ped_sample in ped_samples: 9559 ped[ped_sample] = ped_sample 9560 9561 # Check pedigree 9562 if not ped or len(ped) == 0: 9563 msg_error = f"Error in pedigree: samples {ped_samples}" 9564 log.error(msg_error) 9565 raise ValueError(msg_error) 9566 9567 # Log 9568 log.info( 9569 "Calculation 'BARCODEFAMILY' - Samples: " 9570 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 9571 ) 9572 log.debug(f"ped_samples={ped_samples}") 9573 9574 # Field 9575 barcode_infos = prefix + tag 9576 9577 # Variants table 9578 table_variants = self.get_table_variants() 9579 9580 # Header 9581 vcf_reader = self.get_header() 9582 9583 # Create variant id 9584 variant_id_column = self.get_variant_id_column() 9585 added_columns = [variant_id_column] 9586 9587 # variant_id, FORMAT and samples 9588 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9589 [f""" "{sample}" """ for sample in ped_samples] 9590 ) 9591 9592 # Create dataframe 9593 dataframe_barcode = self.get_query_to_df( 9594 f""" SELECT {samples_fields} FROM {table_variants} """ 9595 ) 9596 9597 # Create barcode column 9598 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9599 lambda row: barcode(row, samples=ped_samples), axis=1 9600 ) 9601 9602 # Add barcode family to header 9603 # Add vaf_normalization to header 9604 vcf_reader.formats[tag] = vcf.parser._Format( 9605 id=tag, 9606 num=".", 9607 type="String", 9608 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9609 type_code=self.code_type_map.get("String"), 9610 ) 9611 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9612 id=f"{tag}S", 9613 num=".", 9614 type="String", 9615 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 9616 type_code=self.code_type_map.get("String"), 9617 ) 9618 9619 # Update 9620 # for sample in ped_samples: 9621 sql_update_set = [] 9622 for sample in self.get_header_sample_list() + ["FORMAT"]: 9623 if 
sample in ped_samples: 9624 value = f'dataframe_barcode."{barcode_infos}"' 9625 value_samples = ( 9626 "'" 9627 + ",".join([f""" "{sample}" """ for sample in ped_samples]) 9628 + "'" 9629 ) 9630 ped_samples 9631 elif sample == "FORMAT": 9632 value = f"'{tag}'" 9633 value_samples = f"'{tag}S'" 9634 else: 9635 value = "'.'" 9636 value_samples = "'.'" 9637 format_regex = r"[a-zA-Z0-9\s]" 9638 sql_update_set.append( 9639 f""" 9640 "{sample}" = 9641 concat( 9642 CASE 9643 WHEN {table_variants}."{sample}" = './.' 9644 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9645 ELSE {table_variants}."{sample}" 9646 END, 9647 ':', 9648 {value}, 9649 ':', 9650 {value_samples} 9651 ) 9652 """ 9653 ) 9654 9655 sql_update_set_join = ", ".join(sql_update_set) 9656 sql_update = f""" 9657 UPDATE {table_variants} 9658 SET {sql_update_set_join} 9659 FROM dataframe_barcode 9660 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9661 """ 9662 self.conn.execute(sql_update) 9663 9664 # Remove added columns 9665 for added_column in added_columns: 9666 self.drop_column(column=added_column) 9667 9668 # Delete dataframe 9669 del dataframe_barcode 9670 gc.collect() 9671 9672 def calculation_trio(self) -> None: 9673 """ 9674 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9675 information to the INFO field of each variant. 
9676 """ 9677 9678 # if FORMAT and samples 9679 if ( 9680 "FORMAT" in self.get_header_columns_as_list() 9681 and self.get_header_sample_list() 9682 ): 9683 9684 # trio annotation field 9685 trio_tag = "trio" 9686 9687 # VCF infos tags 9688 vcf_infos_tags = { 9689 "trio": "trio calculation", 9690 } 9691 9692 # Param 9693 param = self.get_param() 9694 9695 # Prefix 9696 prefix = self.get_explode_infos_prefix() 9697 9698 # Trio param 9699 trio_ped = ( 9700 param.get("calculation", {}) 9701 .get("calculations", {}) 9702 .get("TRIO", {}) 9703 .get("trio_pedigree", None) 9704 ) 9705 9706 # Load trio 9707 if trio_ped: 9708 9709 # Trio pedigree is a file 9710 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9711 log.debug("TRIO pedigree is file") 9712 with open(full_path(trio_ped)) as trio_ped: 9713 trio_ped = yaml.safe_load(trio_ped) 9714 9715 # Trio pedigree is a string 9716 elif isinstance(trio_ped, str): 9717 log.debug("TRIO pedigree is str") 9718 try: 9719 trio_ped = json.loads(trio_ped) 9720 log.debug("TRIO pedigree is json str") 9721 except ValueError as e: 9722 trio_samples = trio_ped.split(",") 9723 if len(trio_samples) == 3: 9724 trio_ped = { 9725 "father": trio_samples[0], 9726 "mother": trio_samples[1], 9727 "child": trio_samples[2], 9728 } 9729 log.debug("TRIO pedigree is list str") 9730 else: 9731 msg_error = "TRIO pedigree not well formatted" 9732 log.error(msg_error) 9733 raise ValueError(msg_error) 9734 9735 # Trio pedigree is a dict 9736 elif isinstance(trio_ped, dict): 9737 log.debug("TRIO pedigree is dict") 9738 9739 # Trio pedigree is not well formatted 9740 else: 9741 msg_error = "TRIO pedigree not well formatted" 9742 log.error(msg_error) 9743 raise ValueError(msg_error) 9744 9745 # Construct trio list 9746 trio_samples = [ 9747 trio_ped.get("father", ""), 9748 trio_ped.get("mother", ""), 9749 trio_ped.get("child", ""), 9750 ] 9751 9752 else: 9753 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9754 samples_list = self.get_header_sample_list() 9755 if len(samples_list) >= 3: 9756 trio_samples = self.get_header_sample_list()[0:3] 9757 trio_ped = { 9758 "father": trio_samples[0], 9759 "mother": trio_samples[1], 9760 "child": trio_samples[2], 9761 } 9762 else: 9763 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9764 log.error(msg_error) 9765 raise ValueError(msg_error) 9766 9767 # Check trio pedigree 9768 if not trio_ped or len(trio_ped) != 3: 9769 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9770 log.error(msg_error) 9771 raise ValueError(msg_error) 9772 9773 # Log 9774 log.info( 9775 f"Calculation 'TRIO' - Samples: " 9776 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9777 ) 9778 9779 # Field 9780 trio_infos = prefix + trio_tag 9781 9782 # Variants table 9783 table_variants = self.get_table_variants() 9784 9785 # Header 9786 vcf_reader = self.get_header() 9787 9788 # Create variant id 9789 variant_id_column = self.get_variant_id_column() 9790 added_columns = [variant_id_column] 9791 9792 # variant_id, FORMAT and samples 9793 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9794 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9795 ) 9796 9797 # Create dataframe 9798 dataframe_trio = self.get_query_to_df( 9799 f""" SELECT {samples_fields} FROM {table_variants} """ 9800 ) 9801 9802 # Create trio column 9803 dataframe_trio[trio_infos] = dataframe_trio.apply( 9804 lambda row: trio(row, samples=trio_samples), axis=1 9805 ) 9806 9807 # Add trio to header 9808 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9809 trio_tag, 9810 ".", 9811 "String", 9812 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9813 "howard calculation", 9814 "0", 9815 self.code_type_map.get("String"), 9816 ) 9817 9818 # Update 9819 sql_update = f""" 9820 UPDATE {table_variants} 9821 SET "INFO" = 9822 concat( 9823 CASE 9824 WHEN "INFO" IS NULL OR "INFO" IN 
('','.') 9825 THEN '' 9826 ELSE concat("INFO", ';') 9827 END, 9828 CASE 9829 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9830 AND dataframe_trio."{trio_infos}" NOT NULL 9831 THEN concat( 9832 '{trio_tag}=', 9833 dataframe_trio."{trio_infos}" 9834 ) 9835 ELSE '' 9836 END 9837 ) 9838 FROM dataframe_trio 9839 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9840 """ 9841 self.conn.execute(sql_update) 9842 9843 # Remove added columns 9844 for added_column in added_columns: 9845 self.drop_column(column=added_column) 9846 9847 # Delete dataframe 9848 del dataframe_trio 9849 gc.collect() 9850 9851 def calculation_vaf_normalization(self) -> None: 9852 """ 9853 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9854 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9855 :return: The function does not return anything. 9856 """ 9857 9858 # if FORMAT and samples 9859 if ( 9860 "FORMAT" in self.get_header_columns_as_list() 9861 and self.get_header_sample_list() 9862 ): 9863 9864 # vaf_normalization annotation field 9865 vaf_normalization_tag = "VAF" 9866 9867 # VCF infos tags 9868 vcf_infos_tags = { 9869 "VAF": "VAF Variant Frequency", 9870 } 9871 9872 # Prefix 9873 prefix = self.get_explode_infos_prefix() 9874 9875 # Variants table 9876 table_variants = self.get_table_variants() 9877 9878 # Header 9879 vcf_reader = self.get_header() 9880 9881 # Do not calculate if VAF already exists 9882 if "VAF" in vcf_reader.formats: 9883 log.debug("VAF already on genotypes") 9884 return 9885 9886 # Create variant id 9887 variant_id_column = self.get_variant_id_column() 9888 added_columns = [variant_id_column] 9889 9890 # variant_id, FORMAT and samples 9891 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9892 f""" "{sample}" """ for sample in self.get_header_sample_list() 9893 ) 9894 9895 # Create dataframe 9896 query = f""" SELECT 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        The statistics produced are: number of values, list of values, min, max,
        mean, mediane and standard deviation, each added as its own INFO tag
        named "{info}_stats_<stat>".

        Only runs when the VCF has a FORMAT column and at least one sample.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO tag per statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field holding the dict of all statistics
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (column added for the join, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: per-row dict of all statistics
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags (SQL CASE fragments, one per statistic)
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic from the per-row dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic tag to the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: only the first appended field omits the leading ';'
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                # NOTE(review): if the first statistic is NULL while a later one
                # is not, the later field still carries its leading ';' and the
                # resulting INFO may contain ';;' — confirm whether upstream
                # guarantees all statistics are non-NULL together.
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                                '{sep}{stat}=',
                                dataframe_vaf_stats."{stat}"
                            )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update INFO joining on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
column 10010 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 10011 lambda row: genotype_stats( 10012 row, samples=self.get_header_sample_list(), info=info 10013 ), 10014 axis=1, 10015 ) 10016 10017 # List of vcf tags 10018 sql_vaf_stats_fields = [] 10019 10020 # Check all VAF stats infos 10021 for stat in vcf_infos_tags: 10022 10023 # Extract stats 10024 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 10025 lambda x: dict(x).get(stat, "") 10026 ) 10027 10028 # Add snpeff_hgvs to header 10029 vcf_reader.infos[stat] = vcf.parser._Info( 10030 stat, 10031 ".", 10032 "String", 10033 vcf_infos_tags.get(stat, "genotype statistics"), 10034 "howard calculation", 10035 "0", 10036 self.code_type_map.get("String"), 10037 ) 10038 10039 if len(sql_vaf_stats_fields): 10040 sep = ";" 10041 else: 10042 sep = "" 10043 10044 # Create fields to add in INFO 10045 sql_vaf_stats_fields.append( 10046 f""" 10047 CASE 10048 WHEN dataframe_vaf_stats."{stat}" NOT NULL 10049 THEN concat( 10050 '{sep}{stat}=', 10051 dataframe_vaf_stats."{stat}" 10052 ) 10053 ELSE '' 10054 END 10055 """ 10056 ) 10057 10058 # SQL set for update 10059 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 10060 10061 # Update 10062 sql_update = f""" 10063 UPDATE {table_variants} 10064 SET "INFO" = 10065 concat( 10066 CASE 10067 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10068 THEN '' 10069 ELSE concat("INFO", ';') 10070 END, 10071 {sql_vaf_stats_fields_set} 10072 ) 10073 FROM dataframe_vaf_stats 10074 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 10075 10076 """ 10077 self.conn.execute(sql_update) 10078 10079 # Remove added columns 10080 for added_column in added_columns: 10081 self.drop_column(column=added_column) 10082 10083 # Delete dataframe 10084 del dataframe_vaf_stats 10085 gc.collect() 10086 10087 def calculation_transcripts_annotation( 10088 self, info_json: str = None, info_format: str = None 10089 ) -> None: 10090 
""" 10091 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 10092 field to it if transcripts are available. 10093 10094 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10095 is a string parameter that represents the information field to be used in the transcripts JSON. 10096 It is used to specify the JSON format for the transcripts information. If no value is provided 10097 when calling the method, it defaults to " 10098 :type info_json: str 10099 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10100 method is a string parameter that specifies the format of the information field to be used in 10101 the transcripts JSON. It is used to define the format of the information field 10102 :type info_format: str 10103 """ 10104 10105 # Create transcripts table 10106 transcripts_table = self.create_transcript_view() 10107 10108 # Add info field 10109 if transcripts_table: 10110 self.transcript_view_to_variants( 10111 transcripts_table=transcripts_table, 10112 transcripts_info_field_json=info_json, 10113 transcripts_info_field_format=info_format, 10114 ) 10115 else: 10116 log.info("No Transcripts to process. Check param.json file configuration") 10117 10118 def calculation_transcripts_prioritization(self) -> None: 10119 """ 10120 The function `calculation_transcripts_prioritization` creates a transcripts table and 10121 prioritizes transcripts based on certain criteria. 10122 """ 10123 10124 # Create transcripts table 10125 transcripts_table = self.create_transcript_view() 10126 10127 # Add info field 10128 if transcripts_table: 10129 self.transcripts_prioritization(transcripts_table=transcripts_table) 10130 else: 10131 log.info("No Transcripts to process. 
Check param.json file configuration") 10132 10133 def calculation_transcripts_export(self) -> None: 10134 """ """ 10135 10136 # Create transcripts table 10137 transcripts_table = self.create_transcript_view() 10138 10139 # Add info field 10140 if transcripts_table: 10141 self.transcripts_export(transcripts_table=transcripts_table) 10142 else: 10143 log.info("No Transcripts to process. Check param.json file configuration") 10144 10145 ############### 10146 # Transcripts # 10147 ############### 10148 10149 def transcripts_export( 10150 self, transcripts_table: str = None, param: dict = {} 10151 ) -> bool: 10152 """ """ 10153 10154 log.debug("Start transcripts export...") 10155 10156 # Param 10157 if not param: 10158 param = self.get_param() 10159 10160 # Param export 10161 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10162 10163 # Output file 10164 transcripts_export_output = param_transcript_export.get("output", None) 10165 10166 if not param_transcript_export or not transcripts_export_output: 10167 log.warning(f"No transcriipts export parameters defined!") 10168 return False 10169 10170 # List of transcripts annotations 10171 query_describe = f""" 10172 SELECT column_name 10173 FROM ( 10174 DESCRIBE SELECT * FROM {transcripts_table} 10175 ) 10176 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10177 """ 10178 transcripts_annotations_list = list( 10179 self.get_query_to_df(query=query_describe)["column_name"] 10180 ) 10181 10182 # Create transcripts table for export 10183 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10184 random.choices(string.ascii_uppercase + string.digits, k=10) 10185 ) 10186 query_create_transcripts_table_export = f""" 10187 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10188 """ 10189 self.execute_query(query=query_create_transcripts_table_export) 10190 10191 # 
Output file format 10192 transcripts_export_output_format = get_file_format( 10193 filename=transcripts_export_output 10194 ) 10195 10196 # Format VCF - construct INFO 10197 if transcripts_export_output_format in ["vcf"]: 10198 10199 # Construct query update INFO and header 10200 query_update_info = [] 10201 for field in transcripts_annotations_list: 10202 10203 # If field not in header 10204 if field not in self.get_header_infos_list(): 10205 10206 # Add PZ Transcript in header 10207 self.get_header().infos[field] = vcf.parser._Info( 10208 field, 10209 ".", 10210 "String", 10211 f"Annotation '{field}' from transcript view", 10212 "unknown", 10213 "unknown", 10214 0, 10215 ) 10216 10217 # Add field as INFO/tag 10218 query_update_info.append( 10219 f""" 10220 CASE 10221 WHEN "{field}" IS NOT NULL 10222 THEN concat('{field}=', "{field}", ';') 10223 ELSE '' 10224 END 10225 """ 10226 ) 10227 10228 # Query param 10229 query_update_info_value = ( 10230 f""" concat('', {", ".join(query_update_info)}) """ 10231 ) 10232 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' 
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        Prioritize transcripts and write the selected transcript's annotations
        back into the variants table INFO column.

        The method runs the generic `prioritization` on the transcripts table,
        ranks transcripts per variant (optionally honoring a transcript
        preference file), keeps the top-ranked transcript (rn = 1) and appends
        its PZ fields to each variant's INFO.

        :param transcripts_table: name of the transcripts table; when None it
            is created via `create_transcript_view`
        :type transcripts_table: str
        :param param: configuration dictionary; defaults to self.get_param().
            Uses param["transcripts"]["prioritization"] (pzprefix, pzfields,
            profiles, ordering and transcript-preference options)
        :type param: dict
        :return: True when prioritization completed, False when no profile is
            defined or the underlying prioritization did not run
        :raises ValueError: if no transcripts table is available, if a field to
            explode is unknown, or if the preference file does not exist
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table (create it if not provided)
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
        if transcripts_table is None:
            # NOTE(review): "availalble" typo kept — runtime string, out of scope here
            msg_err = "No Transcripts table availalble"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO column if not exists (needed by the prioritization step)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields: maps source field name -> prefixed INFO tag name
        pz_param_pzfields = {}

        # PZ field holding the selected transcript (e.g. "PTZTranscript")
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript in header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory fields if asked in param
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param: split requested pzfields into mandatory ones
        # (kept prefixed) and free annotations (prefixed + declared in header)
        pz_param_mandatory_fields = []
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
                pz_param_mandatory_fields.append(
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Add PZ Transcript in header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # PZ fields param
        # NOTE(review): this overwrites the full mandatory list built above with
        # only the fields requested in param — presumably intentional ("if asked
        # in param"), but worth confirming
        pz_mandatory_fields = pz_param_mandatory_fields
        pz_param["pzfields"] = pz_mandatory_fields

        # Prioritization of the transcripts table itself
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query fragments (select list, INFO concat, ORDER BY)
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by (defaults to Flag DESC, Score DESC with the configured prefix)
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode from INFO into real columns before ranking
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode (must exist in header or as a table column)
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced (preference order wins over PZ order)
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced (match transcript ids including version)
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration
            # (split_part drops the ".N" version suffix when not forced)
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update (rn = 1 is the preferred transcript)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update (no preference file: PZ order only)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table:
        # append '<prefix>Transcript=<id>' plus each PZ field to INFO
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True
create_transcript_view_from_columns_map( 10577 self, 10578 transcripts_table: str = "transcripts", 10579 columns_maps: dict = {}, 10580 added_columns: list = [], 10581 temporary_tables: list = None, 10582 annotation_fields: list = None, 10583 column_rename: dict = {}, 10584 column_clean: bool = False, 10585 column_case: str = None, 10586 ) -> tuple[list, list, list]: 10587 """ 10588 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10589 specified columns mapping for transcripts data. 10590 10591 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10592 of the table where the transcripts data is stored or will be stored in the database. This table 10593 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10594 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10595 :type transcripts_table: str (optional) 10596 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10597 about how to map columns from a transcripts table to create a view. Each entry in the 10598 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10599 typically includes details such as the main transcript column and additional information columns 10600 :type columns_maps: dict 10601 :param added_columns: The `added_columns` parameter in the 10602 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10603 that will be added to the view being created based on the columns map provided. 
These columns 10604 are generated by exploding the transcript information columns along with the main transcript 10605 column 10606 :type added_columns: list 10607 :param temporary_tables: The `temporary_tables` parameter in the 10608 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10609 tables created during the process of creating a transcript view from a columns map. These 10610 temporary tables are used to store intermediate results or transformations before the final view 10611 is generated 10612 :type temporary_tables: list 10613 :param annotation_fields: The `annotation_fields` parameter in the 10614 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10615 used for annotation in the query view creation process. These fields are extracted from the 10616 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10617 :type annotation_fields: list 10618 :param column_rename: The `column_rename` parameter in the 10619 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10620 custom renaming for columns during the creation of the temporary table view. This parameter 10621 provides a mapping of original column names to the desired renamed column names. By using this 10622 parameter, 10623 :type column_rename: dict 10624 :param column_clean: The `column_clean` parameter in the 10625 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10626 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10627 removing any non-alphanumeric characters from them. 
This cleaning process ensures, defaults to 10628 False 10629 :type column_clean: bool (optional) 10630 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10631 function is used to specify the case transformation to be applied to the columns during the view 10632 creation process. It allows you to control whether the column values should be converted to 10633 lowercase, uppercase, or remain unchanged 10634 :type column_case: str 10635 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10636 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 10637 """ 10638 10639 log.debug("Start transcrpts view creation from columns map...") 10640 10641 # "from_columns_map": [ 10642 # { 10643 # "transcripts_column": "Ensembl_transcriptid", 10644 # "transcripts_infos_columns": [ 10645 # "genename", 10646 # "Ensembl_geneid", 10647 # "LIST_S2_score", 10648 # "LIST_S2_pred", 10649 # ], 10650 # }, 10651 # { 10652 # "transcripts_column": "Ensembl_transcriptid", 10653 # "transcripts_infos_columns": [ 10654 # "genename", 10655 # "VARITY_R_score", 10656 # "Aloft_pred", 10657 # ], 10658 # }, 10659 # ], 10660 10661 # Init 10662 if temporary_tables is None: 10663 temporary_tables = [] 10664 if annotation_fields is None: 10665 annotation_fields = [] 10666 10667 # Variants table 10668 table_variants = self.get_table_variants() 10669 10670 for columns_map in columns_maps: 10671 10672 # Log 10673 log.debug(f"columns_map={columns_map}") 10674 10675 # Transcript column 10676 transcripts_column = columns_map.get("transcripts_column", None) 10677 10678 # Transcripts infos columns 10679 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10680 10681 # Transcripts infos columns rename 10682 column_rename = columns_map.get("column_rename", column_rename) 10683 10684 # Transcripts infos columns clean 10685 column_clean = columns_map.get("column_clean", column_clean) 10686 
10687 # Transcripts infos columns case 10688 column_case = columns_map.get("column_case", column_case) 10689 10690 if transcripts_column is not None: 10691 10692 # Explode 10693 added_columns += self.explode_infos( 10694 fields=[transcripts_column] + transcripts_infos_columns 10695 ) 10696 10697 # View clauses 10698 clause_select_variants = [] 10699 clause_select_tanscripts = [] 10700 for field in [transcripts_column] + transcripts_infos_columns: 10701 10702 # AS field 10703 as_field = field 10704 10705 # Rename 10706 if column_rename: 10707 as_field = column_rename.get(as_field, as_field) 10708 10709 # Clean 10710 if column_clean: 10711 as_field = clean_annotation_field(as_field) 10712 10713 # Case 10714 if column_case: 10715 if column_case.lower() in ["lower"]: 10716 as_field = as_field.lower() 10717 elif column_case.lower() in ["upper"]: 10718 as_field = as_field.upper() 10719 10720 # Clause select Variants 10721 clause_select_variants.append( 10722 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10723 ) 10724 10725 if field in [transcripts_column]: 10726 clause_select_tanscripts.append( 10727 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10728 ) 10729 else: 10730 clause_select_tanscripts.append( 10731 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10732 ) 10733 annotation_fields.append(as_field) 10734 10735 # Query View 10736 query = f""" 10737 SELECT 10738 "#CHROM", POS, REF, ALT, INFO, 10739 "{transcripts_column}" AS 'transcript', 10740 {", ".join(clause_select_tanscripts)} 10741 FROM ( 10742 SELECT 10743 "#CHROM", POS, REF, ALT, INFO, 10744 {", ".join(clause_select_variants)} 10745 FROM {table_variants} 10746 ) 10747 WHERE "{transcripts_column}" IS NOT NULL 10748 """ 10749 10750 # Create temporary table 10751 temporary_table = transcripts_table + "".join( 10752 random.choices(string.ascii_uppercase + string.digits, k=10) 10753 ) 10754 10755 # Temporary view 10756 temporary_tables.append(temporary_table) 10757 
query_view = f""" 10758 CREATE view {temporary_table} 10759 AS ({query}) 10760 """ 10761 self.execute_query(query=query_view) 10762 10763 return added_columns, temporary_tables, annotation_fields 10764 10765 def create_transcript_view_from_column_format( 10766 self, 10767 transcripts_table: str = "transcripts", 10768 column_formats: dict = {}, 10769 temporary_tables: list = None, 10770 annotation_fields: list = None, 10771 column_rename: dict = {}, 10772 column_clean: bool = False, 10773 column_case: str = None, 10774 ) -> tuple[list, list, list]: 10775 """ 10776 The `create_transcript_view_from_column_format` function generates a transcript view based on 10777 specified column formats, adds additional columns and annotation fields, and returns the list of 10778 temporary tables and annotation fields. 10779 10780 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10781 of the table containing the transcripts data. This table will be used as the base table for 10782 creating the transcript view. The default value for this parameter is "transcripts", but you can 10783 provide a different table name if needed, defaults to transcripts 10784 :type transcripts_table: str (optional) 10785 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10786 about the columns to be used for creating the transcript view. Each entry in the dictionary 10787 specifies the mapping between a transcripts column and a transcripts infos column. This 10788 parameter allows you to define how the columns from the transcripts table should be transformed 10789 or mapped 10790 :type column_formats: dict 10791 :param temporary_tables: The `temporary_tables` parameter in the 10792 `create_transcript_view_from_column_format` function is a list that stores the names of 10793 temporary views created during the process of creating a transcript view from a column format. 
10794 These temporary views are used to manipulate and extract data before generating the final 10795 transcript view 10796 :type temporary_tables: list 10797 :param annotation_fields: The `annotation_fields` parameter in the 10798 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10799 that are extracted from the temporary views created during the process. These annotation fields 10800 are obtained by querying the temporary views and extracting the column names excluding specific 10801 columns like `#CH 10802 :type annotation_fields: list 10803 :param column_rename: The `column_rename` parameter in the 10804 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10805 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10806 column names to new column names in this dictionary, you can rename specific columns during the 10807 process 10808 :type column_rename: dict 10809 :param column_clean: The `column_clean` parameter in the 10810 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10811 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10812 will be cleaned during the creation of the transcript view based on the specified column format, 10813 defaults to False 10814 :type column_clean: bool (optional) 10815 :param column_case: The `column_case` parameter in the 10816 `create_transcript_view_from_column_format` function is used to specify the case transformation 10817 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10818 to convert the column names to uppercase or lowercase, respectively 10819 :type column_case: str 10820 :return: The `create_transcript_view_from_column_format` function returns two lists: 10821 `temporary_tables` and `annotation_fields`. 
10822 """ 10823 10824 log.debug("Start transcrpts view creation from column format...") 10825 10826 # "from_column_format": [ 10827 # { 10828 # "transcripts_column": "ANN", 10829 # "transcripts_infos_column": "Feature_ID", 10830 # } 10831 # ], 10832 10833 # Init 10834 if temporary_tables is None: 10835 temporary_tables = [] 10836 if annotation_fields is None: 10837 annotation_fields = [] 10838 10839 added_columns = [] 10840 10841 for column_format in column_formats: 10842 10843 # annotation field and transcript annotation field 10844 annotation_field = column_format.get("transcripts_column", "ANN") 10845 transcript_annotation = column_format.get( 10846 "transcripts_infos_column", "Feature_ID" 10847 ) 10848 10849 # Transcripts infos columns rename 10850 column_rename = column_format.get("column_rename", column_rename) 10851 10852 # Transcripts infos columns clean 10853 column_clean = column_format.get("column_clean", column_clean) 10854 10855 # Transcripts infos columns case 10856 column_case = column_format.get("column_case", column_case) 10857 10858 # Temporary View name 10859 temporary_view_name = transcripts_table + "".join( 10860 random.choices(string.ascii_uppercase + string.digits, k=10) 10861 ) 10862 10863 # Create temporary view name 10864 temporary_view_name, added_columns = self.annotation_format_to_table( 10865 annotation_field=annotation_field, 10866 view_name=temporary_view_name, 10867 annotation_id=transcript_annotation, 10868 column_rename=column_rename, 10869 column_clean=column_clean, 10870 column_case=column_case, 10871 ) 10872 10873 # Annotation fields 10874 if temporary_view_name: 10875 query_annotation_fields = f""" 10876 SELECT * 10877 FROM ( 10878 DESCRIBE SELECT * 10879 FROM {temporary_view_name} 10880 ) 10881 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10882 """ 10883 df_annotation_fields = self.get_query_to_df( 10884 query=query_annotation_fields 10885 ) 10886 10887 # Add temporary view and annotation fields 10888 
temporary_tables.append(temporary_view_name) 10889 annotation_fields += list(set(df_annotation_fields["column_name"])) 10890 10891 return added_columns, temporary_tables, annotation_fields 10892 10893 def create_transcript_view( 10894 self, 10895 transcripts_table: str = None, 10896 transcripts_table_drop: bool = False, 10897 param: dict = {}, 10898 ) -> str: 10899 """ 10900 The `create_transcript_view` function generates a transcript view by processing data from a 10901 specified table based on provided parameters and structural information. 10902 10903 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 10904 is used to specify the name of the table that will store the final transcript view data. If a table 10905 name is not provided, the function will create a new table to store the transcript view data, and by 10906 default,, defaults to transcripts 10907 :type transcripts_table: str (optional) 10908 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 10909 `create_transcript_view` function is a boolean parameter that determines whether to drop the 10910 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 10911 the function will drop the existing transcripts table if it exists, defaults to False 10912 :type transcripts_table_drop: bool (optional) 10913 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 10914 contains information needed to create a transcript view. It includes details such as the structure 10915 of the transcripts, columns mapping, column formats, and other necessary information for generating 10916 the view. This parameter allows for flexibility and customization 10917 :type param: dict 10918 :return: The `create_transcript_view` function returns the name of the transcripts table that was 10919 created or modified during the execution of the function. 
10920 """ 10921 10922 log.debug("Start transcripts view creation...") 10923 10924 # Default 10925 transcripts_table_default = "transcripts" 10926 10927 # Param 10928 if not param: 10929 param = self.get_param() 10930 10931 # Struct 10932 struct = param.get("transcripts", {}).get("struct", None) 10933 10934 # Transcript veresion 10935 transcript_id_remove_version = param.get("transcripts", {}).get( 10936 "transcript_id_remove_version", False 10937 ) 10938 10939 # Transcripts mapping 10940 transcript_id_mapping_file = param.get("transcripts", {}).get( 10941 "transcript_id_mapping_file", None 10942 ) 10943 10944 # Transcripts mapping 10945 transcript_id_mapping_force = param.get("transcripts", {}).get( 10946 "transcript_id_mapping_force", None 10947 ) 10948 10949 # Transcripts table 10950 if transcripts_table is None: 10951 transcripts_table = param.get("transcripts", {}).get( 10952 "table", transcripts_table_default 10953 ) 10954 10955 # Check transcripts table exists 10956 if transcripts_table: 10957 10958 # Query to check if transcripts table exists 10959 query_check_table = f""" 10960 SELECT * 10961 FROM information_schema.tables 10962 WHERE table_name = '{transcripts_table}' 10963 """ 10964 df_check_table = self.get_query_to_df(query=query_check_table) 10965 10966 # Check if transcripts table exists 10967 if len(df_check_table) > 0 and not transcripts_table_drop: 10968 log.debug(f"Table {transcripts_table} exists and not drop option") 10969 return transcripts_table 10970 10971 if struct: 10972 10973 # added_columns 10974 added_columns = [] 10975 10976 # Temporary tables 10977 temporary_tables = [] 10978 10979 # Annotation fields 10980 annotation_fields = [] 10981 10982 # from columns map 10983 columns_maps = struct.get("from_columns_map", []) 10984 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10985 self.create_transcript_view_from_columns_map( 10986 transcripts_table=transcripts_table, 10987 columns_maps=columns_maps, 10988 
added_columns=added_columns, 10989 temporary_tables=temporary_tables, 10990 annotation_fields=annotation_fields, 10991 ) 10992 ) 10993 added_columns += added_columns_tmp 10994 temporary_tables += temporary_tables_tmp 10995 annotation_fields += annotation_fields_tmp 10996 10997 # from column format 10998 column_formats = struct.get("from_column_format", []) 10999 added_columns, temporary_tables_tmp, annotation_fields_tmp = ( 11000 self.create_transcript_view_from_column_format( 11001 transcripts_table=transcripts_table, 11002 column_formats=column_formats, 11003 temporary_tables=temporary_tables, 11004 annotation_fields=annotation_fields, 11005 ) 11006 ) 11007 added_columns += added_columns_tmp 11008 temporary_tables += temporary_tables_tmp 11009 annotation_fields += annotation_fields_tmp 11010 11011 # Remove some specific fields/column 11012 annotation_fields = list(set(annotation_fields)) 11013 for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]: 11014 if field in annotation_fields: 11015 annotation_fields.remove(field) 11016 11017 # Merge temporary tables query 11018 query_merge = "" 11019 for temporary_table in list(set(temporary_tables)): 11020 11021 # First temporary table 11022 if not query_merge: 11023 query_merge = f""" 11024 SELECT * FROM {temporary_table} 11025 """ 11026 # other temporary table (using UNION) 11027 else: 11028 query_merge += f""" 11029 UNION BY NAME SELECT * FROM {temporary_table} 11030 """ 11031 11032 # transcript table tmp 11033 transcript_table_tmp = "transcripts_tmp" 11034 transcript_table_tmp2 = "transcripts_tmp2" 11035 transcript_table_tmp3 = "transcripts_tmp3" 11036 11037 # Merge on transcript 11038 query_merge_on_transcripts_annotation_fields = [] 11039 11040 # Add transcript list 11041 query_merge_on_transcripts_annotation_fields.append( 11042 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """ 11043 ) 11044 11045 # Aggregate all annotations 
fields 11046 for annotation_field in set(annotation_fields): 11047 query_merge_on_transcripts_annotation_fields.append( 11048 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """ 11049 ) 11050 11051 # Transcripts mapping 11052 if transcript_id_mapping_file: 11053 11054 # Transcript dataframe 11055 transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe" 11056 transcript_id_mapping_dataframe = transcripts_file_to_df( 11057 transcript_id_mapping_file, column_names=["transcript", "alias"] 11058 ) 11059 11060 # Transcript version remove 11061 if transcript_id_remove_version: 11062 query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped" 11063 query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)" 11064 query_left_join = f""" 11065 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11066 """ 11067 else: 11068 query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped" 11069 query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript" 11070 query_left_join = f""" 11071 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11072 """ 11073 11074 # Transcript column for group by merge 11075 query_transcript_merge_group_by = """ 11076 CASE 11077 WHEN transcript_mapped NOT IN ('') 11078 THEN split_part(transcript_mapped, '.', 1) 
11079 ELSE split_part(transcript_original, '.', 1) 11080 END 11081 """ 11082 11083 # Merge query 11084 transcripts_tmp2_query = f""" 11085 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)} 11086 FROM ({query_merge}) AS {transcript_table_tmp} 11087 {query_left_join} 11088 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by} 11089 """ 11090 11091 # Retrive columns after mege 11092 transcripts_tmp2_describe_query = f""" 11093 DESCRIBE {transcripts_tmp2_query} 11094 """ 11095 transcripts_tmp2_describe_list = list( 11096 self.get_query_to_df(query=transcripts_tmp2_describe_query)[ 11097 "column_name" 11098 ] 11099 ) 11100 11101 # Create list of columns for select clause 11102 transcripts_tmp2_describe_select_clause = [] 11103 for field in transcripts_tmp2_describe_list: 11104 if field not in [ 11105 "#CHROM", 11106 "POS", 11107 "REF", 11108 "ALT", 11109 "INFO", 11110 "transcript_mapped", 11111 ]: 11112 as_field = field 11113 if field in ["transcript_original"]: 11114 as_field = "transcripts_mapped" 11115 transcripts_tmp2_describe_select_clause.append( 11116 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """ 11117 ) 11118 11119 # Merge with mapping 11120 query_merge_on_transcripts = f""" 11121 SELECT 11122 "#CHROM", POS, REF, ALT, INFO, 11123 CASE 11124 WHEN ANY_VALUE(transcript_mapped) NOT IN ('') 11125 THEN ANY_VALUE(transcript_mapped) 11126 ELSE ANY_VALUE(transcript_original) 11127 END AS transcript, 11128 {", ".join(transcripts_tmp2_describe_select_clause)} 11129 FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2} 11130 GROUP BY "#CHROM", POS, REF, ALT, INFO, 11131 {query_transcript_merge_group_by} 11132 """ 11133 11134 # Add transcript filter from mapping file 11135 if transcript_id_mapping_force: 11136 query_merge_on_transcripts = f""" 11137 SELECT * 11138 FROM ({query_merge_on_transcripts}) AS 
{transcript_table_tmp3} 11139 WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe) 11140 """ 11141 11142 # No transcript mapping 11143 else: 11144 11145 # Remove transcript version 11146 if transcript_id_remove_version: 11147 query_transcript_column = f""" 11148 split_part({transcript_table_tmp}.transcript, '.', 1) 11149 """ 11150 else: 11151 query_transcript_column = """ 11152 transcript 11153 """ 11154 11155 # Query sections 11156 query_transcript_column_select = ( 11157 f"{query_transcript_column} AS transcript" 11158 ) 11159 query_transcript_column_group_by = query_transcript_column 11160 11161 # Query for transcripts view 11162 query_merge_on_transcripts = f""" 11163 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)} 11164 FROM ({query_merge}) AS {transcript_table_tmp} 11165 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} 11166 """ 11167 11168 # Drop transcript view is necessary 11169 if transcripts_table_drop: 11170 query_drop = f""" 11171 DROP TABLE IF EXISTS {transcripts_table}; 11172 """ 11173 self.execute_query(query=query_drop) 11174 11175 # List of unique #CHROM 11176 query_unique_chrom = f""" 11177 SELECT DISTINCT "#CHROM" 11178 FROM variants AS subquery 11179 """ 11180 unique_chroms = self.get_query_to_df(query=query_unique_chrom) 11181 11182 # Create table with structure but without data, if not exists 11183 query_create_table = f""" 11184 CREATE TABLE IF NOT EXISTS {transcripts_table} AS 11185 SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0 11186 """ 11187 self.execute_query(query=query_create_table) 11188 11189 # Process by #CHROM 11190 for chrom in unique_chroms["#CHROM"]: 11191 11192 # Log 11193 log.debug(f"Processing #CHROM={chrom}") 11194 11195 # Select data by #CHROM 11196 query_chunk = f""" 11197 SELECT * 11198 
FROM ({query_merge_on_transcripts}) 11199 WHERE "#CHROM" = '{chrom}' 11200 """ 11201 11202 # Insert data 11203 query_insert_chunk = f""" 11204 INSERT INTO {transcripts_table} 11205 {query_chunk} 11206 """ 11207 self.execute_query(query=query_insert_chunk) 11208 11209 # Remove temporary tables 11210 if temporary_tables: 11211 for temporary_table in list(set(temporary_tables)): 11212 try: 11213 query_drop_tmp_table = f""" 11214 DROP TABLE IF EXISTS {temporary_table} 11215 """ 11216 self.execute_query(query=query_drop_tmp_table) 11217 except Exception as e: 11218 log.debug(f"'{temporary_table}' Not a table") 11219 try: 11220 query_drop_tmp_table = f""" 11221 DROP VIEW IF EXISTS {temporary_table} 11222 """ 11223 self.execute_query(query=query_drop_tmp_table) 11224 except Exception as e: 11225 log.debug(f"'{temporary_table}' Not a view") 11226 11227 # Remove added columns 11228 for added_column in added_columns: 11229 self.drop_column(column=added_column) 11230 11231 else: 11232 11233 transcripts_table = None 11234 11235 return transcripts_table 11236 11237 def annotation_format_to_table( 11238 self, 11239 annotation_field: str = "ANN", 11240 annotation_id: str = "Feature_ID", 11241 view_name: str = "transcripts", 11242 column_rename: dict = {}, 11243 column_clean: bool = False, 11244 column_case: str = None, 11245 ) -> str: 11246 """ 11247 The `annotation_format_to_table` function converts annotation data from a VCF file into a 11248 structured table format, ensuring unique values and creating a temporary table for further 11249 processing or analysis. 11250 11251 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure 11252 unique values in the output or not. 
If set to `True`, the function will make sure that the 11253 output values are unique, defaults to True 11254 :type uniquify: bool (optional) 11255 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file 11256 that contains the annotation information for each variant. This field is used to extract the 11257 annotation details for further processing in the function. By default, it is set to "ANN", 11258 defaults to ANN 11259 :type annotation_field: str (optional) 11260 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method 11261 is used to specify the identifier for the annotation feature. This identifier will be used as a 11262 column name in the resulting table or view that is created based on the annotation data. It 11263 helps in uniquely identifying each annotation entry in the, defaults to Feature_ID 11264 :type annotation_id: str (optional) 11265 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used 11266 to specify the name of the temporary table that will be created to store the transformed 11267 annotation data. This table will hold the extracted information from the annotation field in a 11268 structured format for further processing or analysis. By default,, defaults to transcripts 11269 :type view_name: str (optional) 11270 :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method 11271 is a dictionary that allows you to specify custom renaming for columns. By providing key-value 11272 pairs in this dictionary, you can rename specific columns in the resulting table or view that is 11273 created based on the annotation data. This feature enables 11274 :type column_rename: dict 11275 :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is 11276 a boolean flag that determines whether the annotation field should undergo a cleaning process. 
11277 If set to `True`, the function will clean the annotation field before further processing. This 11278 cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults 11279 to False 11280 :type column_clean: bool (optional) 11281 :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is 11282 used to specify the case transformation to be applied to the column names extracted from the 11283 annotation data. It allows you to set the case of the column names to either lowercase or 11284 uppercase for consistency or other specific requirements during the conversion 11285 :type column_case: str 11286 :return: The function `annotation_format_to_table` is returning the name of the view created, 11287 which is stored in the variable `view_name`. 11288 """ 11289 11290 # Transcript annotation 11291 if column_rename: 11292 annotation_id = column_rename.get(annotation_id, annotation_id) 11293 11294 if column_clean: 11295 annotation_id = clean_annotation_field(annotation_id) 11296 11297 # Prefix 11298 prefix = self.get_explode_infos_prefix() 11299 if prefix: 11300 prefix = "INFO/" 11301 11302 # Variants table 11303 table_variants = self.get_table_variants() 11304 11305 # Header 11306 vcf_reader = self.get_header() 11307 11308 # Add columns 11309 added_columns = [] 11310 11311 # Explode HGVS field in column 11312 added_columns += self.explode_infos(fields=[annotation_field]) 11313 11314 if annotation_field in vcf_reader.infos: 11315 11316 # Extract ANN header 11317 ann_description = vcf_reader.infos[annotation_field].desc 11318 pattern = r"'(.+?)'" 11319 match = re.search(pattern, ann_description) 11320 if match: 11321 ann_header_match = match.group(1).split(" | ") 11322 ann_header = [] 11323 ann_header_desc = {} 11324 for i in range(len(ann_header_match)): 11325 ann_header_info = "".join( 11326 char for char in ann_header_match[i] if char.isalnum() 11327 ) 11328 ann_header.append(ann_header_info) 11329 
ann_header_desc[ann_header_info] = ann_header_match[i] 11330 if not ann_header_desc: 11331 raise ValueError("Invalid header description format") 11332 else: 11333 raise ValueError("Invalid header description format") 11334 11335 # Create dataframe for keys column type 11336 dataframe_annotation_format = self.get_query_to_df( 11337 f""" 11338 WITH exploded_annotations AS ( 11339 SELECT 11340 UNNEST(STRING_SPLIT(ANN, ',')) AS annotation 11341 FROM {table_variants} 11342 ), 11343 split_annotations AS ( 11344 SELECT 11345 {", ".join([f"SPLIT_PART(annotation, '|', {i+1}) AS '{header}'" for i, header in enumerate(ann_header_desc.values())])}, 11346 FROM exploded_annotations 11347 ) 11348 SELECT * FROM split_annotations 11349 LIMIT 1000 11350 """ 11351 ) 11352 11353 # Init 11354 query_list_keys = [] 11355 key_i = 0 11356 11357 for key in dataframe_annotation_format.keys(): 11358 11359 # Key 11360 key_i += 1 11361 key_clean = key 11362 11363 # key rename 11364 if column_rename: 11365 key_clean = column_rename.get(key_clean, key_clean) 11366 11367 # key clean 11368 if column_clean: 11369 key_clean = clean_annotation_field(key_clean) 11370 11371 # Key case 11372 if column_case: 11373 if column_case.lower() in ["lower"]: 11374 key_clean = key_clean.lower() 11375 elif column_case.lower() in ["upper"]: 11376 key_clean = key_clean.upper() 11377 11378 # Detect column type 11379 column_type = detect_column_type(dataframe_annotation_format[key]) 11380 11381 # Append key to list 11382 query_list_keys.append( 11383 f""" NULLIF(SPLIT_PART(annotation, '|', {key_i}), '')::{column_type} AS '{prefix}{key_clean}' """ 11384 ) 11385 11386 # Create temporary table 11387 query_create_view = f""" 11388 CREATE VIEW {view_name} AS ( 11389 WITH exploded_annotations AS ( 11390 SELECT 11391 "#CHROM", 11392 POS, 11393 REF, 11394 ALT, 11395 INFO, 11396 UNNEST(STRING_SPLIT(ANN, ',')) AS annotation 11397 FROM {table_variants} 11398 ), 11399 split_annotations AS ( 11400 SELECT 11401 "#CHROM", 11402 POS, 
11403 REF, 11404 ALT, 11405 INFO, 11406 {", ".join(query_list_keys)}, 11407 FROM exploded_annotations 11408 ) 11409 SELECT *, {annotation_id} AS 'transcript' FROM split_annotations 11410 ) 11411 """ 11412 log.debug(f"query_create_view: {query_create_view}") 11413 self.execute_query(query=query_create_view) 11414 11415 else: 11416 11417 # Return None 11418 view_name = None 11419 11420 return view_name, added_columns 11421 11422 def transcript_view_to_variants( 11423 self, 11424 transcripts_table: str = None, 11425 transcripts_column_id: str = None, 11426 transcripts_info_json: str = None, 11427 transcripts_info_field_json: str = None, 11428 transcripts_info_format: str = None, 11429 transcripts_info_field_format: str = None, 11430 param: dict = {}, 11431 ) -> bool: 11432 """ 11433 The `transcript_view_to_variants` function updates a variants table with information from 11434 transcripts in JSON format. 11435 11436 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11437 table containing the transcripts data. If this parameter is not provided, the function will 11438 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11439 :type transcripts_table: str 11440 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11441 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11442 identifier is used to match transcripts with variants in the database 11443 :type transcripts_column_id: str 11444 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11445 of the column in the variants table where the transcripts information will be stored in JSON 11446 format. 
This parameter allows you to define the column in the variants table that will hold the 11447 JSON-formatted information about transcripts 11448 :type transcripts_info_json: str 11449 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11450 specify the field in the VCF header that will contain information about transcripts in JSON 11451 format. This field will be added to the VCF header as an INFO field with the specified name 11452 :type transcripts_info_field_json: str 11453 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11454 format of the information about transcripts that will be stored in the variants table. This 11455 format can be used to define how the transcript information will be structured or displayed 11456 within the variants table 11457 :type transcripts_info_format: str 11458 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11459 specify the field in the VCF header that will contain information about transcripts in a 11460 specific format. This field will be added to the VCF header as an INFO field with the specified 11461 name 11462 :type transcripts_info_field_format: str 11463 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11464 that contains various configuration settings related to transcripts. It is used to provide 11465 default values for certain parameters if they are not explicitly provided when calling the 11466 method. The `param` dictionary can be passed as an argument 11467 :type param: dict 11468 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11469 if the operation is successful and `False` if certain conditions are not met. 
11470 """ 11471 11472 msg_info_prefix = "Start transcripts view to variants annotations" 11473 11474 log.debug(f"{msg_info_prefix}...") 11475 11476 # Default 11477 transcripts_table_default = "transcripts" 11478 transcripts_column_id_default = "transcript" 11479 transcripts_info_json_default = None 11480 transcripts_info_format_default = None 11481 transcripts_info_field_json_default = None 11482 transcripts_info_field_format_default = None 11483 11484 # Param 11485 if not param: 11486 param = self.get_param() 11487 11488 # Transcripts table 11489 if transcripts_table is None: 11490 transcripts_table = param.get("transcripts", {}).get( 11491 "table", transcripts_table_default 11492 ) 11493 11494 # Transcripts column ID 11495 if transcripts_column_id is None: 11496 transcripts_column_id = param.get("transcripts", {}).get( 11497 "column_id", transcripts_column_id_default 11498 ) 11499 11500 # Transcripts info json 11501 if transcripts_info_json is None: 11502 transcripts_info_json = param.get("transcripts", {}).get( 11503 "transcripts_info_json", transcripts_info_json_default 11504 ) 11505 11506 # Transcripts info field JSON 11507 if transcripts_info_field_json is None: 11508 transcripts_info_field_json = param.get("transcripts", {}).get( 11509 "transcripts_info_field_json", transcripts_info_field_json_default 11510 ) 11511 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11512 # transcripts_info_json = transcripts_info_field_json 11513 11514 # Transcripts info format 11515 if transcripts_info_format is None: 11516 transcripts_info_format = param.get("transcripts", {}).get( 11517 "transcripts_info_format", transcripts_info_format_default 11518 ) 11519 11520 # Transcripts info field FORMAT 11521 if transcripts_info_field_format is None: 11522 transcripts_info_field_format = param.get("transcripts", {}).get( 11523 "transcripts_info_field_format", transcripts_info_field_format_default 11524 ) 11525 # if ( 11526 # 
transcripts_info_field_format is not None 11527 # and transcripts_info_format is None 11528 # ): 11529 # transcripts_info_format = transcripts_info_field_format 11530 11531 # Variants table 11532 table_variants = self.get_table_variants() 11533 11534 # Check info columns param 11535 if ( 11536 transcripts_info_json is None 11537 and transcripts_info_field_json is None 11538 and transcripts_info_format is None 11539 and transcripts_info_field_format is None 11540 ): 11541 return False 11542 11543 # Transcripts infos columns 11544 query_transcripts_infos_columns = f""" 11545 SELECT * 11546 FROM ( 11547 DESCRIBE SELECT * FROM {transcripts_table} 11548 ) 11549 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11550 """ 11551 transcripts_infos_columns = list( 11552 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11553 ) 11554 11555 # View results 11556 clause_select = [] 11557 clause_to_json = [] 11558 clause_to_format = [] 11559 for field in transcripts_infos_columns: 11560 # Do not consider INFO field for export into fields 11561 if field not in ["INFO"]: 11562 clause_select.append( 11563 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11564 ) 11565 clause_to_json.append(f""" '{field}': "{field}" """) 11566 clause_to_format.append(f""" "{field}" """) 11567 11568 # Update 11569 update_set_json = [] 11570 update_set_format = [] 11571 11572 # VCF header 11573 vcf_reader = self.get_header() 11574 11575 # Transcripts to info column in JSON 11576 if transcripts_info_json: 11577 11578 # Create column on variants table 11579 self.add_column( 11580 table_name=table_variants, 11581 column_name=transcripts_info_json, 11582 column_type="JSON", 11583 default_value=None, 11584 drop=False, 11585 ) 11586 11587 # Add header 11588 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11589 transcripts_info_json, 11590 ".", 11591 "String", 11592 "Transcripts in JSON format", 11593 "unknwon", 
11594 "unknwon", 11595 self.code_type_map["String"], 11596 ) 11597 11598 # Add to update 11599 update_set_json.append( 11600 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11601 ) 11602 11603 # Transcripts to info field in JSON 11604 if transcripts_info_field_json: 11605 11606 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11607 11608 # Add to update 11609 update_set_json.append( 11610 f""" 11611 INFO = concat( 11612 CASE 11613 WHEN INFO NOT IN ('', '.') 11614 THEN INFO 11615 ELSE '' 11616 END, 11617 CASE 11618 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11619 THEN concat( 11620 ';{transcripts_info_field_json}=', 11621 t.{transcripts_info_json} 11622 ) 11623 ELSE '' 11624 END 11625 ) 11626 """ 11627 ) 11628 11629 # Add header 11630 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11631 transcripts_info_field_json, 11632 ".", 11633 "String", 11634 "Transcripts in JSON format", 11635 "unknwon", 11636 "unknwon", 11637 self.code_type_map["String"], 11638 ) 11639 11640 if update_set_json: 11641 11642 # Update query 11643 query_update = f""" 11644 UPDATE {table_variants} 11645 SET {", ".join(update_set_json)} 11646 FROM 11647 ( 11648 SELECT 11649 "#CHROM", POS, REF, ALT, 11650 concat( 11651 '{{', 11652 string_agg( 11653 '"' || "{transcripts_column_id}" || '":' || 11654 to_json(json_output) 11655 ), 11656 '}}' 11657 )::JSON AS {transcripts_info_json} 11658 FROM 11659 ( 11660 SELECT 11661 "#CHROM", POS, REF, ALT, 11662 "{transcripts_column_id}", 11663 to_json( 11664 {{{",".join(clause_to_json)}}} 11665 )::JSON AS json_output 11666 FROM 11667 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11668 WHERE "{transcripts_column_id}" IS NOT NULL 11669 ) 11670 GROUP BY "#CHROM", POS, REF, ALT 11671 ) AS t 11672 WHERE {table_variants}."#CHROM" = t."#CHROM" 11673 AND {table_variants}."POS" = t."POS" 11674 AND {table_variants}."REF" = t."REF" 11675 AND 
{table_variants}."ALT" = t."ALT" 11676 """ 11677 11678 self.execute_query(query=query_update) 11679 11680 # Transcripts to info column in FORMAT 11681 if transcripts_info_format: 11682 11683 # Create column on variants table 11684 self.add_column( 11685 table_name=table_variants, 11686 column_name=transcripts_info_format, 11687 column_type="VARCHAR", 11688 default_value=None, 11689 drop=False, 11690 ) 11691 11692 # Add header 11693 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11694 transcripts_info_format, 11695 ".", 11696 "String", 11697 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11698 "unknwon", 11699 "unknwon", 11700 self.code_type_map["String"], 11701 ) 11702 11703 # Add to update 11704 update_set_format.append( 11705 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11706 ) 11707 11708 else: 11709 11710 # Set variable for internal queries 11711 transcripts_info_format = "transcripts_info_format" 11712 11713 # Transcripts to info field in JSON 11714 if transcripts_info_field_format: 11715 11716 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11717 11718 # Add to update 11719 update_set_format.append( 11720 f""" 11721 INFO = concat( 11722 CASE 11723 WHEN INFO NOT IN ('', '.') 11724 THEN INFO 11725 ELSE '' 11726 END, 11727 CASE 11728 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11729 THEN concat( 11730 ';{transcripts_info_field_format}=', 11731 t.{transcripts_info_format} 11732 ) 11733 ELSE '' 11734 END 11735 ) 11736 """ 11737 ) 11738 11739 # Add header 11740 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11741 transcripts_info_field_format, 11742 ".", 11743 "String", 11744 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11745 "unknwon", 11746 "unknwon", 11747 self.code_type_map["String"], 11748 ) 11749 11750 if update_set_format: 11751 11752 # Update query 11753 query_update = f""" 11754 UPDATE 
{table_variants} 11755 SET {", ".join(update_set_format)} 11756 FROM 11757 ( 11758 SELECT 11759 "#CHROM", POS, REF, ALT, 11760 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11761 FROM 11762 ( 11763 SELECT 11764 "#CHROM", POS, REF, ALT, 11765 "{transcripts_column_id}", 11766 concat( 11767 "{transcripts_column_id}", 11768 '|', 11769 {", '|', ".join(clause_to_format)} 11770 ) AS {transcripts_info_format} 11771 FROM 11772 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11773 ) 11774 GROUP BY "#CHROM", POS, REF, ALT 11775 ) AS t 11776 WHERE {table_variants}."#CHROM" = t."#CHROM" 11777 AND {table_variants}."POS" = t."POS" 11778 AND {table_variants}."REF" = t."REF" 11779 AND {table_variants}."ALT" = t."ALT" 11780 """ 11781 11782 self.execute_query(query=query_update) 11783 11784 return True 11785 11786 def rename_info_fields( 11787 self, fields_to_rename: dict = None, table: str = None 11788 ) -> dict: 11789 """ 11790 The `rename_info_fields` function renames specified fields in a VCF file header and updates 11791 corresponding INFO fields in the variants table. 11792 11793 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the 11794 mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary 11795 represent the original field names that need to be renamed, and the corresponding values 11796 represent the new names to which the fields should be 11797 :type fields_to_rename: dict 11798 :param table: The `table` parameter in the `rename_info_fields` function represents the name of 11799 the table in which the variants data is stored. 
        This table contains information about genetic
        variants, and the function updates the corresponding INFO fields in this table when renaming
        specified fields in the VCF file header
        :type table: str
        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
        the original field names as keys and their corresponding new names (or None if the field was
        removed) as values after renaming or removing specified fields in a VCF file header and updating
        corresponding INFO fields in the variants table.
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        # Default to the main variants table
        if table is None:
            table = self.get_table_variants()

        # regexp replace function: one nested regexp_replace chain per partition of
        # `regex_replace_partition` fields — presumably to bound SQL expression nesting
        # depth; confirm against the DuckDB limit
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "concat(INFO, ';')"  # Add ';' to reduce regexp complexity

        # Skip entirely when nothing to do or when access is read-only
        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Rename header entry; a None target means the field is removed
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # INFO rewrite patterns: match ';FIELD=value;' or flag-style ';FIELD;'
                    field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;"
                    if field_renamed is not None:
                        field_renamed_pattern = rf"\1{field_renamed}\3;"
                    else:
                        field_renamed_pattern = r"\1"

                    # regexp replace: nest this replacement into the current partition's chain
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(
                        regex_replace_nb / regex_replace_partition
                    )
                    # Start a fresh chain when entering a new partition
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "concat(INFO, ';')"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Return
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'"
                        )
                    else:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' removed"
                        )

                else:

                    log.warning(
                        f"Rename or remove fields - field '{field_to_rename}' not in header"
                    )

            # Rename INFO: one UPDATE per partition; the trailing ';' added above is stripped
            for regex_replace_key, regex_replace in regex_replace_dict.items():
                log.info(
                    f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..."
                )
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = regexp_replace({regex_replace}, ';$', '')
                """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed

    def calculation_rename_info_fields(
        self,
        fields_to_rename: dict = None,
        table: str = None,
        operation_name: str = "RENAME_INFO_FIELDS",
    ) -> None:
        """
        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
        fields to rename and table if provided, and then calls another function to rename the fields.

        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
        renamed in a table.
        Each key-value pair in the dictionary represents the original field name as
        the key and the new field name as the value
        :type fields_to_rename: dict
        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
        specify the name of the table for which the fields are to be renamed. It is a string type
        parameter
        :type table: str
        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
        method is a string that specifies the name of the operation being performed. In this context, it
        is used as a default value for the operation name if not explicitly provided when calling the
        function, defaults to RENAME_INFO_FIELDS
        :type operation_name: str (optional)
        """

        # Param
        param = self.get_param()

        # Get param fields to rename
        param_fields_to_rename = (
            param.get("calculation", {})
            .get("calculations", {})
            .get(operation_name, {})
            .get("fields_to_rename", None)
        )

        # Get param table
        param_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get(operation_name, {})
            .get("table", None)
        )

        # Init fields_to_rename (explicit argument takes precedence over param)
        if fields_to_rename is None:
            fields_to_rename = param_fields_to_rename

        # Init table
        if table is None:
            table = param_table

        renamed_fields = self.rename_info_fields(
            fields_to_rename=fields_to_rename, table=table
        )

        log.debug(f"renamed_fields:{renamed_fields}")

    def create_annotations_view(
        self,
        table: str = None,
        view: str = None,
        view_type: str = None,
        fields: list = None,
        prefix: str = "",
        drop_view: bool = False,
        fields_to_rename: dict = None,
        limit: int = None,
    ) -> str:
        """
        The `create_annotations_view` function creates a SQL view from fields in a
VCF INFO column. 11969 11970 :param table: The `table` parameter in the `create_annotations_view` function is used to specify 11971 the name of the table from which the fields are to be extracted. This table contains the 11972 variants data, and the function creates a view based on the fields in the INFO column of this 11973 table 11974 :type table: str 11975 :param view: The `view` parameter in the `create_annotations_view` function is used to specify 11976 the name of the view that will be created based on the fields in the VCF INFO column. This view 11977 will contain the extracted fields from the INFO column in a structured format for further 11978 processing or analysis 11979 :type view: str 11980 :param view_type: The `view_type` parameter in the `create_annotations_view` function is used to 11981 specify the type of view that will be created. It can be either a `VIEW` or a `TABLE`, and the 11982 function will create the view based on the specified type 11983 :type view_type: str 11984 :param fields: The `fields` parameter in the `create_annotations_view` function is a list that 11985 contains the names of the fields to be extracted from the INFO column in the VCF file. These 11986 fields will be used to create the view with the specified columns and data extracted from the 11987 INFO column 11988 :type fields: list 11989 :param prefix: The `prefix` parameter in the `create_annotations_view` function is used to 11990 specify a prefix that will be added to the field names in the view. This prefix helps in 11991 distinguishing the fields extracted from the INFO column in the view 11992 :type prefix: str 11993 :param drop_view: The `drop_view` parameter in the `create_annotations_view` function is a boolean 11994 flag that determines whether to drop the existing view with the same name before creating a new 11995 view. 
If set to `True`, the function will drop the existing view before creating a new view with 11996 the specified name 11997 :type drop_view: bool 11998 :param fields_to_rename: The `fields_to_rename` parameter in the `create_annotations_view` 11999 function is a dictionary that contains the mapping of fields to be renamed in the VCF file. The 12000 keys in the dictionary represent the original field names that need to be renamed, and the 12001 corresponding values represent the new names to which the fields should be 12002 :type fields_to_rename: dict 12003 :param limit: The `limit` parameter in the `create_annotations_view` function is an integer that 12004 specifies the maximum number of rows to be included in the view. If provided, the function will 12005 limit the number of rows in the view to the specified value 12006 :type limit: int 12007 :return: The `create_annotations_view` function returns the name of the view that is created 12008 based on the fields extracted from the INFO column in the VCF file. This view contains the 12009 extracted fields in a structured format for further processing or analysis 12010 """ 12011 12012 # Create a sql view from fields in VCF INFO column, with each column is a field present in the VCF header (with a specific type from VCF header) and extracted from INFO column (with a regexp like in rename_info_fields), and each row is a variant. 12013 12014 # Get table 12015 if table is None: 12016 table = self.get_table_variants() 12017 12018 # Get view 12019 if view is None: 12020 view = f"{table}_annotations" 12021 12022 # Get view type 12023 if view_type is None: 12024 view_type = "VIEW" 12025 12026 # Check view type value 12027 if view_type.upper() not in ["VIEW", "TABLE"]: 12028 raise ValueError( 12029 f"Invalid view type value: {view_type}. 
Either 'VIEW' or 'TABLE'" 12030 ) 12031 12032 # Get header 12033 header = self.get_header() 12034 12035 # Get fields 12036 if fields is None: 12037 fields = list(header.infos.keys()) 12038 12039 # Get fields to rename 12040 if fields_to_rename is None: 12041 fields_to_rename = {} 12042 12043 log.info( 12044 f"Create '{view}' view (as '{view_type}') from table '{table}' with {len(fields)} fields" 12045 ) 12046 12047 # Describe table 12048 table_describe_query = f""" 12049 DESCRIBE {table} 12050 """ 12051 table_describe = self.get_query_to_df(query=table_describe_query) 12052 12053 # Create fields for annotation view extracted from INFO column in table variants (with regexp_replace like in rename_info_fields), with column type from VCF header 12054 fields_columns = [] 12055 fields_needed = ["#CHROM", "POS", "REF", "ALT"] 12056 field_sql_type_list = False 12057 for field in fields: 12058 12059 # Rename field 12060 field_to_rename = fields_to_rename.get(field, field) 12061 12062 # Check field type 12063 12064 # Needed fields 12065 if field in fields_needed: 12066 continue 12067 12068 # Fields in table 12069 elif field in list(table_describe.get("column_name")): 12070 fields_columns.append(f""" "{field}" AS '{prefix}{field_to_rename}' """) 12071 12072 # Fields in header 12073 elif field in header.infos: 12074 12075 # Field info 12076 field_infos = header.infos.get(field, None) 12077 12078 # Field SQL type 12079 field_sql_type = code_type_map_to_sql.get(field_infos.type, "VARCHAR") 12080 12081 # Column is a list 12082 if field_infos.num != 1: 12083 field_sql_type_list = True 12084 12085 # Colonne is a flag 12086 if field_infos.type == "Flag": 12087 field_pattern = rf"(^|;)({field})([^;]*)?" 12088 fields_columns.append( 12089 f""" regexp_matches("INFO", '{field_pattern}')::BOOLEAN AS '{prefix}{field_to_rename}' """ 12090 ) 12091 12092 # Colonne with a type 12093 else: 12094 12095 # Field pattern 12096 field_pattern = rf"(^|;)({field})=([^;]*)?" 
12097 12098 # Field is a list 12099 if field_sql_type_list: 12100 fields_columns.append( 12101 f""" CAST(list_transform(string_split(NULLIF(regexp_extract("INFO", '{field_pattern}', 3), ''), ','), x -> CASE WHEN x = '.' OR x = '' THEN NULL ELSE x END) AS {field_sql_type}[]) AS '{prefix}{field_to_rename}' """ 12102 ) 12103 12104 # Field is a unique value 12105 else: 12106 fields_columns.append( 12107 f""" NULLIF(regexp_replace(regexp_extract("INFO", '{field_pattern}', 3), '^\\.$', ''), '')::{field_sql_type} AS '{prefix}{field_to_rename}' """ 12108 ) 12109 12110 else: 12111 fields_columns.append(f""" null AS '{prefix}{field_to_rename}' """) 12112 msg_err = f"Field '{field}' is not found (in table or header): '{field}' will be set to NULL" 12113 log.warning(msg=msg_err) 12114 12115 # Limit 12116 limit_clause = "" 12117 if limit is not None: 12118 limit_clause = f" LIMIT {limit} " 12119 12120 # Query select 12121 query_select = f""" 12122 SELECT 12123 {', '.join([f'"{field}"' for field in fields_needed])}, {", ".join(fields_columns)} 12124 FROM 12125 {table} 12126 {limit_clause} 12127 """ 12128 12129 # Drop if any 12130 if drop_view: 12131 log.debug(f"Drop view: {view}") 12132 query_create_view = f""" 12133 DROP {view_type} IF EXISTS {view} 12134 """ 12135 self.execute_query(query=query_create_view) 12136 log.debug(f"View dropped: {view}") 12137 12138 # Create view 12139 log.debug(f"Create view: {view}") 12140 query_create_view = f""" 12141 CREATE {view_type} IF NOT EXISTS {view} AS {query_select} 12142 """ 12143 # log.debug(f"query_create_view:{query_create_view}") 12144 self.execute_query(query=query_create_view) 12145 log.debug(f"View created: {view}") 12146 12147 return view
39 def __init__( 40 self, 41 conn=None, 42 input: str = None, 43 output: str = None, 44 config: dict = {}, 45 param: dict = {}, 46 load: bool = False, 47 ) -> None: 48 """ 49 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 50 header 51 52 :param conn: the connection to the database 53 :param input: the input file 54 :param output: the output file 55 :param config: a dictionary containing the configuration of the model 56 :param param: a dictionary containing the parameters of the model 57 """ 58 59 # Init variables 60 self.init_variables() 61 62 # Input 63 self.set_input(input) 64 65 # Config 66 self.set_config(config) 67 68 # Param 69 self.set_param(param) 70 71 # Output 72 self.set_output(output) 73 74 # connexion 75 self.set_connexion(conn) 76 77 # Header 78 self.set_header() 79 80 # Samples 81 self.set_samples() 82 83 # Load data 84 if load: 85 self.load_data()
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
87 def set_samples(self, samples: list = None) -> list: 88 """ 89 The function `set_samples` sets the samples attribute of an object to a provided list or 90 retrieves it from a parameter dictionary. 91 92 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 93 input and sets the `samples` attribute of the class to the provided list. If no samples are 94 provided, it tries to get the samples from the class's parameters using the `get_param` method 95 :type samples: list 96 :return: The `samples` list is being returned. 97 """ 98 99 if not samples: 100 samples = self.get_param().get("samples", {}).get("list", None) 101 102 self.samples = samples 103 104 return samples
The function set_samples sets the samples attribute of an object to a provided list or
retrieves it from a parameter dictionary.
Parameters
- samples: The `set_samples` method takes a list of samples as input and sets the `samples` attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the `get_param` method.
Returns
The `samples` list is being returned.
106 def get_samples(self) -> list: 107 """ 108 This function returns a list of samples. 109 :return: The `get_samples` method is returning the `samples` attribute of the object. 110 """ 111 112 return self.samples
This function returns a list of samples.
Returns
The `get_samples` method is returning the `samples` attribute of the object.
114 def get_samples_check(self) -> bool: 115 """ 116 This function returns the value of the "check" key within the "samples" dictionary retrieved 117 from the parameters. 118 :return: The method `get_samples_check` is returning the value of the key "check" inside the 119 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 120 method. If the key "check" is not found, it will return `False`. 121 """ 122 123 return self.get_param().get("samples", {}).get("check", True)
This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.
Returns
The method `get_samples_check` is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` method. If the key "check" is not found, it will return `True`.
125 def set_input(self, input: str = None) -> None: 126 """ 127 The function `set_input` takes a file name as input, extracts the name and extension, and sets 128 attributes in the class accordingly. 129 130 :param input: The `set_input` method in the provided code snippet is used to set attributes 131 related to the input file. Here's a breakdown of the parameters and their usage in the method: 132 :type input: str 133 """ 134 135 if input and not isinstance(input, str): 136 try: 137 self.input = input.name 138 except: 139 log.error(f"Input file '{input} in bad format") 140 raise ValueError(f"Input file '{input} in bad format") 141 else: 142 self.input = input 143 144 # Input format 145 if input: 146 input_name, input_extension = os.path.splitext(self.input) 147 self.input_name = input_name 148 self.input_extension = input_extension 149 self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The `set_input` method is used to set attributes related to the input file (its name, extension and format).
151 def set_config(self, config: dict) -> None: 152 """ 153 The set_config function takes a config object and assigns it as the configuration object for the 154 class. 155 156 :param config: The `config` parameter in the `set_config` function is a dictionary object that 157 contains configuration settings for the class. When you call the `set_config` function with a 158 dictionary object as the argument, it will set that dictionary as the configuration object for 159 the class 160 :type config: dict 161 """ 162 163 self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The `config` parameter in the `set_config` function is a dictionary object that contains configuration settings for the class. When you call the `set_config` function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class.
165 def set_param(self, param: dict) -> None: 166 """ 167 This function sets a parameter object for the class based on the input dictionary. 168 169 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 170 as the `param` attribute of the class instance 171 :type param: dict 172 """ 173 174 self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The `set_param` method takes a dictionary object as input and sets it as the `param` attribute of the class instance.
176 def init_variables(self) -> None: 177 """ 178 This function initializes the variables that will be used in the rest of the class 179 """ 180 181 self.prefix = "howard" 182 self.table_variants = "variants" 183 self.dataframe = None 184 185 self.comparison_map = { 186 "gt": ">", 187 "gte": ">=", 188 "lt": "<", 189 "lte": "<=", 190 "equals": "=", 191 "contains": "SIMILAR TO", 192 } 193 194 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 195 196 self.code_type_map_to_sql = { 197 "Integer": "INTEGER", 198 "String": "VARCHAR", 199 "Float": "FLOAT", 200 "Flag": "VARCHAR", 201 } 202 203 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
205 def get_indexing(self) -> bool: 206 """ 207 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 208 returns False. 209 :return: The value of the indexing parameter. 210 """ 211 212 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
214 def get_connexion_config(self) -> dict: 215 """ 216 The function `get_connexion_config` returns a dictionary containing the configuration for a 217 connection, including the number of threads and memory limit. 218 :return: a dictionary containing the configuration for the Connexion library. 219 """ 220 221 # config 222 config = self.get_config() 223 224 # Connexion config 225 connexion_config = {} 226 threads = self.get_threads() 227 228 # Threads 229 if threads: 230 connexion_config["threads"] = threads 231 232 # Memory 233 # if config.get("memory", None): 234 # connexion_config["memory_limit"] = config.get("memory") 235 if self.get_memory(): 236 connexion_config["memory_limit"] = self.get_memory() 237 238 # Temporary directory 239 if config.get("tmp", None): 240 connexion_config["temp_directory"] = config.get("tmp") 241 242 # Access 243 if config.get("access", None): 244 access = config.get("access") 245 if access in ["RO"]: 246 access = "READ_ONLY" 247 elif access in ["RW"]: 248 access = "READ_WRITE" 249 connexion_db = self.get_connexion_db() 250 if connexion_db in ":memory:": 251 access = "READ_WRITE" 252 connexion_config["access_mode"] = access 253 254 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
A dictionary containing the configuration for the database connection.
256 def get_duckdb_settings(self) -> dict: 257 """ 258 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 259 string. 260 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 261 """ 262 263 # config 264 config = self.get_config() 265 266 # duckdb settings 267 duckdb_settings_dict = {} 268 if config.get("duckdb_settings", None): 269 duckdb_settings = config.get("duckdb_settings") 270 duckdb_settings = full_path(duckdb_settings) 271 # duckdb setting is a file 272 if os.path.exists(duckdb_settings): 273 with open(duckdb_settings) as json_file: 274 duckdb_settings_dict = yaml.safe_load(json_file) 275 # duckdb settings is a string 276 else: 277 duckdb_settings_dict = json.loads(duckdb_settings) 278 279 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
281 def set_connexion_db(self) -> str: 282 """ 283 The function `set_connexion_db` returns the appropriate database connection string based on the 284 input format and connection type. 285 :return: the value of the variable `connexion_db`. 286 """ 287 288 # Default connexion db 289 default_connexion_db = ":memory:" 290 291 # Find connexion db 292 if self.get_input_format() in ["db", "duckdb"]: 293 connexion_db = self.get_input() 294 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 295 connexion_db = default_connexion_db 296 elif self.get_connexion_type() in ["tmpfile"]: 297 tmp_name = tempfile.mkdtemp( 298 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 299 ) 300 connexion_db = f"{tmp_name}/tmp.db" 301 elif self.get_connexion_type() != "": 302 connexion_db = self.get_connexion_type() 303 else: 304 connexion_db = default_connexion_db 305 306 # Set connexion db 307 self.connexion_db = connexion_db 308 309 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
The value of the variable `connexion_db`.
311 def set_connexion(self, conn) -> None: 312 """ 313 The function `set_connexion` creates a connection to a database, with options for different 314 database formats and settings. 315 316 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 317 database. If a connection is not provided, a new connection to an in-memory database is created. 318 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 319 sqlite 320 """ 321 322 # Connexion db 323 connexion_db = self.set_connexion_db() 324 325 # Connexion config 326 connexion_config = self.get_connexion_config() 327 328 # Connexion format 329 connexion_format = self.get_config().get("connexion_format", "duckdb") 330 # Set connexion format 331 self.connexion_format = connexion_format 332 333 # Connexion 334 if not conn: 335 if connexion_format in ["duckdb"]: 336 conn = duckdb.connect(connexion_db, config=connexion_config) 337 # duckDB settings 338 duckdb_settings = self.get_duckdb_settings() 339 if duckdb_settings: 340 for setting in duckdb_settings: 341 setting_value = duckdb_settings.get(setting) 342 if isinstance(setting_value, str): 343 setting_value = f"'{setting_value}'" 344 conn.execute(f"PRAGMA {setting}={setting_value};") 345 elif connexion_format in ["sqlite"]: 346 conn = sqlite3.connect(connexion_db) 347 348 # Set connexion 349 self.conn = conn 350 351 # Log 352 log.debug(f"connexion_format: {connexion_format}") 353 log.debug(f"connexion_db: {connexion_db}") 354 log.debug(f"connexion config: {connexion_config}") 355 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: The `conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
357 def set_output(self, output: str = None) -> None: 358 """ 359 The `set_output` function in Python sets the output file based on the input or a specified key 360 in the config file, extracting the output name, extension, and format. 361 362 :param output: The `output` parameter in the `set_output` method is used to specify the name of 363 the output file. If the config file has an 'output' key, the method sets the output to the value 364 of that key. If no output is provided, it sets the output to `None` 365 :type output: str 366 """ 367 368 if output and not isinstance(output, str): 369 self.output = output.name 370 else: 371 self.output = output 372 373 # Output format 374 if self.output: 375 output_name, output_extension = os.path.splitext(self.output) 376 self.output_name = output_name 377 self.output_extension = output_extension 378 self.output_format = self.output_extension.replace(".", "") 379 else: 380 self.output_name = None 381 self.output_extension = None 382 self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: The `output` parameter in the `set_output` method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
    def set_header(self) -> None:
        """
        It reads the header of a VCF file and stores it as a list of strings and as a VCF object.

        Resolution order for the header: a header file from the config
        ("header_file"), the input VCF/hdr file itself, a sibling
        '<input>.hdr' file, and finally a header reconstructed from the file
        columns; a minimal default VCF header is used as last resort.
        Stores `self.header_list` (list of lines) and `self.header_vcf`
        (vcf.Reader object), or None for both when there is no input.
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            # NOTE(review): `f` is not closed via a `with`
                            # block; an exception from vcf.Writer would leak
                            # the handle
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except deliberately treats any
                    # failure as "no header"; it also swallows KeyboardInterrupt
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
486 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 487 """ 488 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 489 DataFrame based on the connection format. 490 491 :param query: The `query` parameter in the `get_query_to_df` function is a string that 492 represents the SQL query you want to execute. This query will be used to fetch data from a 493 database and convert it into a pandas DataFrame 494 :type query: str 495 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 496 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 497 function will only fetch up to that number of rows from the database query result. If no limit 498 is specified, 499 :type limit: int 500 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 501 """ 502 503 # Connexion format 504 connexion_format = self.get_connexion_format() 505 506 # Limit in query 507 if limit: 508 pd.set_option("display.max_rows", limit) 509 if connexion_format in ["duckdb"]: 510 df = ( 511 self.conn.execute(query) 512 .fetch_record_batch(limit) 513 .read_next_batch() 514 .to_pandas() 515 ) 516 elif connexion_format in ["sqlite"]: 517 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 518 519 # Full query 520 else: 521 if connexion_format in ["duckdb"]: 522 df = self.conn.execute(query).df() 523 elif connexion_format in ["sqlite"]: 524 df = pd.read_sql_query(query, self.conn) 525 526 return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: The `query` parameter in the `get_query_to_df` function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame.
- limit: The `limit` parameter in the `get_query_to_df` function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result.
Returns
A pandas DataFrame is being returned by the `get_query_to_df` function.
528 def get_overview(self) -> None: 529 """ 530 The function prints the input, output, config, and dataframe of the current object 531 """ 532 table_variants_from = self.get_table_variants(clause="from") 533 sql_columns = self.get_header_columns_as_sql() 534 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 535 df = self.get_query_to_df(sql_query_export) 536 log.info( 537 "Input: " 538 + str(self.get_input()) 539 + " [" 540 + str(str(self.get_input_format())) 541 + "]" 542 ) 543 log.info( 544 "Output: " 545 + str(self.get_output()) 546 + " [" 547 + str(str(self.get_output_format())) 548 + "]" 549 ) 550 log.info("Config: ") 551 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 552 "\n" 553 ): 554 log.info("\t" + str(d)) 555 log.info("Param: ") 556 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 557 "\n" 558 ): 559 log.info("\t" + str(d)) 560 log.info("Sample list: " + str(self.get_header_sample_list())) 561 log.info("Dataframe: ") 562 for d in str(df).split("\n"): 563 log.info("\t" + str(d)) 564 565 # garbage collector 566 del df 567 gc.collect() 568 569 return None
The function prints the input, output, config, and dataframe of the current object
    def get_stats(self) -> dict:
        """
        The `get_stats` function calculates and returns various statistics of the current object,
        including information about the input file, variants, samples, header fields, quality, and
        SNVs/InDels.

        :return: a dictionary containing various statistics of the current
        object, with top-level keys "Infos", "Variants", "Samples", "Header"
        and (when a QUAL column exists) "Quality".
        """

        # Log
        log.info(f"Stats Calculation...")

        # table varaints
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage of each chromosome over the total
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: count genotypes per sample, for well-formed
        # genotype columns only
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT
                        '{sample}' as sample,
                        REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                        count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                        concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                    )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map PyVCF special Number codes to their VCF letters
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the InDel branch, SQL evaluates AND before OR, so
        # the condition reads `len(REF) > 1 OR (len(ALT) > 1 AND
        # len(REF) != len(ALT))` — this also counts MNVs with len(REF) > 1;
        # confirm whether parentheses around the OR were intended
        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

            """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Substitution counts for SNVs (e.g. A>G)
        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
            """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
793 def stats_to_file(self, file: str = None) -> str: 794 """ 795 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 796 into a JSON object, and writes the JSON object to the specified file. 797 798 :param file: The `file` parameter is a string that represents the file path where the JSON data 799 will be written 800 :type file: str 801 :return: the name of the file that was written to. 802 """ 803 804 # Get stats 805 stats = self.get_stats() 806 807 # Serializing json 808 json_object = json.dumps(stats, indent=4) 809 810 # Writing to sample.json 811 with open(file, "w") as outfile: 812 outfile.write(json_object) 813 814 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The `file` parameter is a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
816 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 817 """ 818 The `print_stats` function generates a markdown file and prints the statistics contained in a 819 JSON file in a formatted manner. 820 821 :param output_file: The `output_file` parameter is a string that specifies the path and filename 822 of the output file where the stats will be printed in Markdown format. If no `output_file` is 823 provided, a temporary directory will be created and the stats will be saved in a file named 824 "stats.md" within that 825 :type output_file: str 826 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 827 file where the statistics will be saved. If no value is provided, a temporary directory will be 828 created and a default file name "stats.json" will be used 829 :type json_file: str 830 :return: The function `print_stats` does not return any value. It has a return type annotation 831 of `None`. 832 """ 833 834 # Full path 835 output_file = full_path(output_file) 836 json_file = full_path(json_file) 837 838 with tempfile.TemporaryDirectory() as tmpdir: 839 840 # Files 841 if not output_file: 842 output_file = os.path.join(tmpdir, "stats.md") 843 if not json_file: 844 json_file = os.path.join(tmpdir, "stats.json") 845 846 # Create folders 847 if not os.path.exists(os.path.dirname(output_file)): 848 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 849 if not os.path.exists(os.path.dirname(json_file)): 850 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 851 852 # Create stats JSON file 853 stats_file = self.stats_to_file(file=json_file) 854 855 # Print stats file 856 with open(stats_file) as f: 857 stats = yaml.safe_load(f) 858 859 # Output 860 output_title = [] 861 output_index = [] 862 output = [] 863 864 # Title 865 output_title.append("# HOWARD Stats") 866 867 # Index 868 output_index.append("## Index") 869 870 # Process sections 871 for section in stats: 
872 infos = stats.get(section) 873 section_link = "#" + section.lower().replace(" ", "-") 874 output.append(f"## {section}") 875 output_index.append(f"- [{section}]({section_link})") 876 877 if len(infos): 878 for info in infos: 879 try: 880 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 881 is_df = True 882 except: 883 try: 884 df = pd.DataFrame.from_dict( 885 json.loads((infos.get(info))), orient="index" 886 ) 887 is_df = True 888 except: 889 is_df = False 890 if is_df: 891 output.append(f"### {info}") 892 info_link = "#" + info.lower().replace(" ", "-") 893 output_index.append(f" - [{info}]({info_link})") 894 output.append(f"{df.to_markdown(index=False)}") 895 else: 896 output.append(f"- {info}: {infos.get(info)}") 897 else: 898 output.append(f"NA") 899 900 # Write stats in markdown file 901 with open(output_file, "w") as fp: 902 for item in output_title: 903 fp.write("%s\n" % item) 904 for item in output_index: 905 fp.write("%s\n" % item) 906 for item in output: 907 fp.write("%s\n" % item) 908 909 # Output stats in markdown 910 print("") 911 print("\n\n".join(output_title)) 912 print("") 913 print("\n\n".join(output)) 914 print("") 915 916 return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The `output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that directory.
- json_file: The `json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used.
Returns
The function `print_stats` does not return any value. It has a return type annotation of `None`.
918 def get_input(self) -> str: 919 """ 920 It returns the value of the input variable. 921 :return: The input is being returned. 922 """ 923 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
925 def get_input_format(self, input_file: str = None) -> str: 926 """ 927 This function returns the format of the input variable, either from the provided input file or 928 by prompting for input. 929 930 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 931 represents the file path of the input file. If no `input_file` is provided when calling the 932 method, it will default to `None` 933 :type input_file: str 934 :return: The format of the input variable is being returned. 935 """ 936 937 if not input_file: 938 input_file = self.get_input() 939 input_format = get_file_format(input_file) 940 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The `input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None` and the object's own input file will be used.
Returns
The format of the input variable is being returned.
942 def get_input_compressed(self, input_file: str = None) -> str: 943 """ 944 The function `get_input_compressed` returns the format of the input variable after compressing 945 it. 946 947 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 948 that represents the file path of the input file. If no `input_file` is provided when calling the 949 method, it will default to `None` and the method will then call `self.get_input()` to 950 :type input_file: str 951 :return: The function `get_input_compressed` returns the compressed format of the input 952 variable. 953 """ 954 955 if not input_file: 956 input_file = self.get_input() 957 input_compressed = get_file_compressed(input_file) 958 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The `input_file` parameter in the `get_input_compressed` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None` and the method will then call `self.get_input()` to determine the file.
Returns
The function
get_input_compressedreturns the compressed format of the input variable.
960 def get_output(self) -> str: 961 """ 962 It returns the output of the neuron. 963 :return: The output of the neural network. 964 """ 965 966 return self.output
It returns the output file path.
Returns
The output attribute of the object.
968 def get_output_format(self, output_file: str = None) -> str: 969 """ 970 The function `get_output_format` returns the format of the input variable or the output file if 971 provided. 972 973 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 974 that represents the file path of the output file. If no `output_file` is provided when calling 975 the method, it will default to the output obtained from the `get_output` method of the class 976 instance. The 977 :type output_file: str 978 :return: The format of the input variable is being returned. 979 """ 980 981 if not output_file: 982 output_file = self.get_output() 983 output_format = get_file_format(output_file) 984 985 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The `output_file` parameter in the `get_output_format` method is a string that represents the file path of the output file. If no `output_file` is provided when calling the method, it will default to the output obtained from the `get_output` method of the class instance.
Returns
The format of the input variable is being returned.
987 def get_config(self) -> dict: 988 """ 989 It returns the config 990 :return: The config variable is being returned. 991 """ 992 return self.config
It returns the config
Returns
The config variable is being returned.
994 def get_param(self) -> dict: 995 """ 996 It returns the param 997 :return: The param variable is being returned. 998 """ 999 return self.param
It returns the param
Returns
The param variable is being returned.
1001 def get_connexion_db(self) -> str: 1002 """ 1003 It returns the connexion_db attribute of the object 1004 :return: The connexion_db is being returned. 1005 """ 1006 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
1008 def get_prefix(self) -> str: 1009 """ 1010 It returns the prefix of the object. 1011 :return: The prefix is being returned. 1012 """ 1013 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
1015 def get_table_variants(self, clause: str = "select") -> str: 1016 """ 1017 This function returns the table_variants attribute of the object 1018 1019 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1020 defaults to select (optional) 1021 :return: The table_variants attribute of the object. 1022 """ 1023 1024 # Access 1025 access = self.get_config().get("access", None) 1026 1027 # Clauses "select", "where", "update" 1028 if clause in ["select", "where", "update"]: 1029 table_variants = self.table_variants 1030 # Clause "from" 1031 elif clause in ["from"]: 1032 # For Read Only 1033 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1034 input_file = self.get_input() 1035 table_variants = f"'{input_file}' as variants" 1036 # For Read Write 1037 else: 1038 table_variants = f"{self.table_variants} as variants" 1039 else: 1040 table_variants = self.table_variants 1041 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
1043 def get_tmp_dir(self) -> str: 1044 """ 1045 The function `get_tmp_dir` returns the temporary directory path based on configuration 1046 parameters or a default path. 1047 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1048 configuration, parameters, and a default value of "/tmp". 1049 """ 1050 1051 return get_tmp( 1052 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1053 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The `get_tmp_dir` method returns the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
1055 def get_connexion_type(self) -> str: 1056 """ 1057 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1058 1059 :return: The connexion type is being returned. 1060 """ 1061 return self.get_config().get("connexion_type", "memory")
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
1063 def get_connexion(self): 1064 """ 1065 It returns the connection object 1066 1067 :return: The connection object. 1068 """ 1069 return self.conn
It returns the connection object
Returns
The connection object.
1071 def close_connexion(self) -> None: 1072 """ 1073 This function closes the connection to the database. 1074 :return: The connection is being closed. 1075 """ 1076 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1078 def get_header(self, type: str = "vcf"): 1079 """ 1080 This function returns the header of the VCF file as a list of strings 1081 1082 :param type: the type of header you want to get, defaults to vcf (optional) 1083 :return: The header of the vcf file. 1084 """ 1085 1086 if self.header_vcf: 1087 if type == "vcf": 1088 return self.header_vcf 1089 elif type == "list": 1090 return self.header_list 1091 else: 1092 if type == "vcf": 1093 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1094 return header 1095 elif type == "list": 1096 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1098 def get_header_infos_list(self) -> list: 1099 """ 1100 This function retrieves a list of information fields from the header. 1101 :return: A list of information fields from the header. 1102 """ 1103 1104 # Init 1105 infos_list = [] 1106 1107 for field in self.get_header().infos: 1108 infos_list.append(field) 1109 1110 return infos_list
This function retrieves a list of information fields from the header.
Returns
A list of information fields from the header.
1112 def get_header_length(self, file: str = None) -> int: 1113 """ 1114 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1115 line. 1116 1117 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1118 header file. If this argument is provided, the function will read the header from the specified 1119 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1120 :type file: str 1121 :return: the length of the header list, excluding the #CHROM line. 1122 """ 1123 1124 if file: 1125 return len(self.read_vcf_header_file(file=file)) - 1 1126 elif self.get_header(type="list"): 1127 return len(self.get_header(type="list")) - 1 1128 else: 1129 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The `file` parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line).
Returns
the length of the header list, excluding the #CHROM line.
1131 def get_header_columns(self) -> str: 1132 """ 1133 This function returns the header list of a VCF 1134 1135 :return: The length of the header list. 1136 """ 1137 if self.get_header(): 1138 return self.get_header(type="list")[-1] 1139 else: 1140 return ""
This function returns the header columns line of a VCF (the "#CHROM ..." line)
Returns
The last header line, containing the column names.
1142 def get_header_columns_as_list(self) -> list: 1143 """ 1144 This function returns the header list of a VCF 1145 1146 :return: The length of the header list. 1147 """ 1148 if self.get_header(): 1149 return self.get_header_columns().strip().split("\t") 1150 else: 1151 return []
This function returns the header columns of a VCF as a list
Returns
The list of header column names.
1153 def get_header_columns_as_sql(self) -> str: 1154 """ 1155 This function retruns header length (without #CHROM line) 1156 1157 :return: The length of the header list. 1158 """ 1159 sql_column_list = [] 1160 for col in self.get_header_columns_as_list(): 1161 sql_column_list.append(f'"{col}"') 1162 return ",".join(sql_column_list)
This function returns the header columns as a comma-separated list of double-quoted names for use in SQL
Returns
The header column names, double-quoted and joined with commas.
1164 def get_header_sample_list( 1165 self, check: bool = False, samples: list = None, samples_force: bool = False 1166 ) -> list: 1167 """ 1168 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1169 checking and filtering based on input parameters. 1170 1171 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1172 parameter that determines whether to check if the samples in the list are properly defined as 1173 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1174 list is defined as a, defaults to False 1175 :type check: bool (optional) 1176 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1177 allows you to specify a subset of samples from the header. If you provide a list of sample 1178 names, the function will check if each sample is defined in the header. If a sample is not found 1179 in the 1180 :type samples: list 1181 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1182 a boolean parameter that determines whether to force the function to return the sample list 1183 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1184 function will return the sample list without performing, defaults to False 1185 :type samples_force: bool (optional) 1186 :return: The function `get_header_sample_list` returns a list of samples based on the input 1187 parameters and conditions specified in the function. 
1188 """ 1189 1190 # Init 1191 samples_list = [] 1192 1193 if samples is None: 1194 samples_list = self.header_vcf.samples 1195 else: 1196 samples_checked = [] 1197 for sample in samples: 1198 if sample in self.header_vcf.samples: 1199 samples_checked.append(sample) 1200 else: 1201 log.warning(f"Sample '{sample}' not defined in header") 1202 samples_list = samples_checked 1203 1204 # Force sample list without checking if is_genotype_column 1205 if samples_force: 1206 log.warning(f"Samples {samples_list} not checked if genotypes") 1207 return samples_list 1208 1209 if check: 1210 samples_checked = [] 1211 for sample in samples_list: 1212 if self.is_genotype_column(column=sample): 1213 samples_checked.append(sample) 1214 else: 1215 log.warning( 1216 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1217 ) 1218 samples_list = samples_checked 1219 1220 # Return samples list 1221 return samples_list
The function get_header_sample_list returns a list of samples from a VCF header, with optional
checking and filtering based on input parameters.
Parameters
- check: The `check` parameter in the `get_header_sample_list` function is a boolean that determines whether to verify that each sample in the list is properly defined as a genotype column. Defaults to False.
- samples: The `samples` parameter in the `get_header_sample_list` function is a list that allows you to specify a subset of samples from the header. Samples not defined in the header are dropped with a warning.
- samples_force: The `samples_force` parameter in the `get_header_sample_list` function is a boolean that, when set to `True`, makes the function return the sample list without checking whether the samples are genotype columns. Defaults to False.
Returns
The function `get_header_sample_list` returns a list of samples based on the input parameters and conditions specified in the function.
1223 def is_genotype_column(self, column: str = None) -> bool: 1224 """ 1225 This function checks if a given column is a genotype column in a database. 1226 1227 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1228 represents the column name in a database table. This method checks if the specified column is a 1229 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1230 method of 1231 :type column: str 1232 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1233 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1234 column name and returns the result. If the `column` parameter is None, it returns False. 1235 """ 1236 1237 if column is not None: 1238 return Database(database=self.get_input()).is_genotype_column(column=column) 1239 else: 1240 return False
This function checks if a given column is a genotype column in a database.
Parameters
- column: The `column` parameter in the `is_genotype_column` method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database.
Returns
The `is_genotype_column` method returns a boolean value. If the `column` parameter is not None, it calls the `is_genotype_column` method of the `Database` class with the specified column name and returns the result. If the `column` parameter is None, it returns False.
1242 def get_verbose(self) -> bool: 1243 """ 1244 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1245 exist 1246 1247 :return: The value of the key "verbose" in the config dictionary. 1248 """ 1249 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1251 def get_connexion_format(self) -> str: 1252 """ 1253 It returns the connexion format of the object. 1254 :return: The connexion_format is being returned. 1255 """ 1256 connexion_format = self.connexion_format 1257 if connexion_format not in ["duckdb", "sqlite"]: 1258 log.error(f"Unknown connexion format {connexion_format}") 1259 raise ValueError(f"Unknown connexion format {connexion_format}") 1260 else: 1261 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
1263 def insert_file_to_table( 1264 self, 1265 file, 1266 columns: str, 1267 header_len: int = 0, 1268 sep: str = "\t", 1269 chunksize: int = 1000000, 1270 ) -> None: 1271 """ 1272 The function reads a file in chunks and inserts each chunk into a table based on the specified 1273 database format. 1274 1275 :param file: The `file` parameter is the file that you want to load into a table. It should be 1276 the path to the file on your system 1277 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1278 should contain the names of the columns in the table where the data will be inserted. The column 1279 names should be separated by commas within the string. For example, if you have columns named 1280 "id", "name 1281 :type columns: str 1282 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1283 the number of lines to skip at the beginning of the file before reading the actual data. This 1284 parameter allows you to skip any header information present in the file before processing the 1285 data, defaults to 0 1286 :type header_len: int (optional) 1287 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1288 separator character that is used in the file being read. In this case, the default separator is 1289 set to `\t`, which represents a tab character. You can change this parameter to a different 1290 separator character if, defaults to \t 1291 :type sep: str (optional) 1292 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1293 when processing the file in chunks. In the provided code snippet, the default value for 1294 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1295 to 1000000 1296 :type chunksize: int (optional) 1297 """ 1298 1299 # Config 1300 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1301 connexion_format = self.get_connexion_format() 1302 1303 log.debug("chunksize: " + str(chunksize)) 1304 1305 if chunksize: 1306 for chunk in pd.read_csv( 1307 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1308 ): 1309 if connexion_format in ["duckdb"]: 1310 sql_insert_into = ( 1311 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1312 ) 1313 self.conn.execute(sql_insert_into) 1314 elif connexion_format in ["sqlite"]: 1315 chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The `file` parameter is the file that you want to load into a table. It should be the path to the file on your system.
- columns: The `columns` parameter in the `insert_file_to_table` function is a string containing the names of the table columns where the data will be inserted, separated by commas.
- header_len: The `header_len` parameter specifies the number of lines to skip at the beginning of the file before reading the actual data. Defaults to 0.
- sep: The `sep` parameter specifies the separator character used in the file being read; the default is a tab character (`\t`).
- chunksize: The `chunksize` parameter specifies the number of rows to read at a time when processing the file in chunks. Defaults to 1000000.
1317 def load_data( 1318 self, 1319 input_file: str = None, 1320 drop_variants_table: bool = False, 1321 sample_size: int = 20480, 1322 ) -> None: 1323 """ 1324 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1325 table before loading the data and specify a sample size. 1326 1327 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1328 table 1329 :type input_file: str 1330 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1331 determines whether the variants table should be dropped before loading the data. If set to 1332 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1333 not be dropped, defaults to False 1334 :type drop_variants_table: bool (optional) 1335 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1336 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1337 20480 1338 :type sample_size: int (optional) 1339 """ 1340 1341 log.info("Loading...") 1342 1343 # change input file 1344 if input_file: 1345 self.set_input(input_file) 1346 self.set_header() 1347 1348 # drop variants table 1349 if drop_variants_table: 1350 self.drop_variants_table() 1351 1352 # get table variants 1353 table_variants = self.get_table_variants() 1354 1355 # Access 1356 access = self.get_config().get("access", None) 1357 log.debug(f"access: {access}") 1358 1359 # Input format and compress 1360 input_format = self.get_input_format() 1361 input_compressed = self.get_input_compressed() 1362 log.debug(f"input_format: {input_format}") 1363 log.debug(f"input_compressed: {input_compressed}") 1364 1365 # input_compressed_format 1366 if input_compressed: 1367 input_compressed_format = "gzip" 1368 else: 1369 input_compressed_format = "none" 1370 log.debug(f"input_compressed_format: {input_compressed_format}") 1371 1372 # Connexion 
format 1373 connexion_format = self.get_connexion_format() 1374 1375 # Sample size 1376 if not sample_size: 1377 sample_size = -1 1378 log.debug(f"sample_size: {sample_size}") 1379 1380 # Load data 1381 log.debug(f"Load Data from {input_format}") 1382 1383 # DuckDB connexion 1384 if connexion_format in ["duckdb"]: 1385 1386 # Database already exists 1387 if self.input_format in ["db", "duckdb"]: 1388 1389 if connexion_format in ["duckdb"]: 1390 log.debug(f"Input file format '{self.input_format}' duckDB") 1391 else: 1392 log.error( 1393 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1394 ) 1395 raise ValueError( 1396 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1397 ) 1398 1399 # Load from existing database format 1400 else: 1401 1402 try: 1403 # Create Table or View 1404 database = Database(database=self.input) 1405 sql_from = database.get_sql_from(sample_size=sample_size) 1406 1407 if access in ["RO"]: 1408 sql_load = ( 1409 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1410 ) 1411 else: 1412 sql_load = ( 1413 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1414 ) 1415 self.conn.execute(sql_load) 1416 1417 except: 1418 # Format not available 1419 log.error(f"Input file format '{self.input_format}' not available") 1420 raise ValueError( 1421 f"Input file format '{self.input_format}' not available" 1422 ) 1423 1424 # SQLite connexion 1425 elif connexion_format in ["sqlite"] and input_format in [ 1426 "vcf", 1427 "tsv", 1428 "csv", 1429 "psv", 1430 ]: 1431 1432 # Main structure 1433 structure = { 1434 "#CHROM": "VARCHAR", 1435 "POS": "INTEGER", 1436 "ID": "VARCHAR", 1437 "REF": "VARCHAR", 1438 "ALT": "VARCHAR", 1439 "QUAL": "VARCHAR", 1440 "FILTER": "VARCHAR", 1441 "INFO": "VARCHAR", 1442 } 1443 1444 # Strcuture with samples 1445 structure_complete = structure 1446 if self.get_header_sample_list(): 1447 structure["FORMAT"] = "VARCHAR" 
1448 for sample in self.get_header_sample_list(): 1449 structure_complete[sample] = "VARCHAR" 1450 1451 # Columns list for create and insert 1452 sql_create_table_columns = [] 1453 sql_create_table_columns_list = [] 1454 for column in structure_complete: 1455 column_type = structure_complete[column] 1456 sql_create_table_columns.append( 1457 f'"{column}" {column_type} default NULL' 1458 ) 1459 sql_create_table_columns_list.append(f'"{column}"') 1460 1461 # Create database 1462 log.debug(f"Create Table {table_variants}") 1463 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1464 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1465 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1466 self.conn.execute(sql_create_table) 1467 1468 # chunksize define length of file chunk load file 1469 chunksize = 100000 1470 1471 # delimiter 1472 delimiter = file_format_delimiters.get(input_format, "\t") 1473 1474 # Load the input file 1475 with open(self.input, "rt") as input_file: 1476 1477 # Use the appropriate file handler based on the input format 1478 if input_compressed: 1479 input_file = bgzf.open(self.input, "rt") 1480 if input_format in ["vcf"]: 1481 header_len = self.get_header_length() 1482 else: 1483 header_len = 0 1484 1485 # Insert the file contents into a table 1486 self.insert_file_to_table( 1487 input_file, 1488 columns=sql_create_table_columns_list_sql, 1489 header_len=header_len, 1490 sep=delimiter, 1491 chunksize=chunksize, 1492 ) 1493 1494 else: 1495 log.error( 1496 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1497 ) 1498 raise ValueError( 1499 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1500 ) 1501 1502 # Explode INFOS fields into table fields 1503 if self.get_explode_infos(): 1504 self.explode_infos( 1505 prefix=self.get_explode_infos_prefix(), 1506 fields=self.get_explode_infos_fields(), 
1507 force=True, 1508 ) 1509 1510 # Create index after insertion 1511 self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The `drop_variants_table` parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to `True`, the variants table will be dropped. If set to `False` (default), it will not be dropped. Defaults to False.
- sample_size: The `sample_size` parameter determines the number of rows to be sampled from the input file. If it is set to `None`, the default value of 20480 will be used. Defaults to 20480.
1513 def get_explode_infos(self) -> bool: 1514 """ 1515 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1516 to False if it is not set. 1517 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1518 value. If the parameter is not present, it will return False. 1519 """ 1520 1521 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
1523 def get_explode_infos_fields( 1524 self, 1525 explode_infos_fields: str = None, 1526 remove_fields_not_in_header: bool = False, 1527 ) -> list: 1528 """ 1529 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1530 the input parameter `explode_infos_fields`. 1531 1532 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1533 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1534 comma-separated list of field names to explode 1535 :type explode_infos_fields: str 1536 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1537 flag that determines whether to remove fields that are not present in the header. If it is set 1538 to `True`, any field that is not in the header will be excluded from the list of exploded 1539 information fields. If it is set to `, defaults to False 1540 :type remove_fields_not_in_header: bool (optional) 1541 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1542 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1543 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1544 Otherwise, it returns a list of exploded information fields after removing any spaces and 1545 splitting the string by commas. 
1546 """ 1547 1548 # If no fields, get it in param 1549 if not explode_infos_fields: 1550 explode_infos_fields = ( 1551 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1552 ) 1553 1554 # If no fields, defined as all fields in header using keyword 1555 if not explode_infos_fields: 1556 explode_infos_fields = "*" 1557 1558 # If fields list not empty 1559 if explode_infos_fields: 1560 1561 # Input fields list 1562 if isinstance(explode_infos_fields, str): 1563 fields_input = explode_infos_fields.split(",") 1564 elif isinstance(explode_infos_fields, list): 1565 fields_input = explode_infos_fields 1566 else: 1567 fields_input = [] 1568 1569 # Fields list without * keyword 1570 fields_without_all = fields_input.copy() 1571 if "*".casefold() in (item.casefold() for item in fields_without_all): 1572 fields_without_all.remove("*") 1573 1574 # Fields in header 1575 fields_in_header = sorted(list(set(self.get_header().infos))) 1576 1577 # Construct list of fields 1578 fields_output = [] 1579 for field in fields_input: 1580 1581 # Strip field 1582 field = field.strip() 1583 1584 # format keyword * in regex 1585 if field.upper() in ["*"]: 1586 field = ".*" 1587 1588 # Find all fields with pattern 1589 r = re.compile(rf"^{field}$") 1590 fields_search = sorted(list(filter(r.match, fields_in_header))) 1591 1592 # Remove fields input from search 1593 if field in fields_search: 1594 fields_search = [field] 1595 elif fields_search != [field]: 1596 fields_search = sorted( 1597 list(set(fields_search).difference(fields_input)) 1598 ) 1599 1600 # If field is not in header (avoid not well formatted header) 1601 if not fields_search and not remove_fields_not_in_header: 1602 fields_search = [field] 1603 1604 # Add found fields 1605 for new_field in fields_search: 1606 # Add field, if not already exists, and if it is in header (if asked) 1607 if ( 1608 new_field not in fields_output 1609 and ( 1610 not remove_fields_not_in_header 1611 or new_field in fields_in_header 
1612 ) 1613 and new_field not in [".*"] 1614 ): 1615 fields_output.append(new_field) 1616 1617 return fields_output 1618 1619 else: 1620 1621 return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the fields to be exploded. It can be set to "*" to explode all fields present in the header, or it can be a comma-separated list of field names (or patterns) to explode.
- remove_fields_not_in_header: The `remove_fields_not_in_header` parameter is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to `True`, any field that is not in the header will be excluded from the list of exploded information fields. Defaults to False.
Returns
The function `get_explode_infos_fields` returns a list of exploded information fields. Entries may be patterns, and the keyword "*" expands to all fields present in the header; when no fields are provided, "*" is assumed.
1623 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1624 """ 1625 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1626 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1627 not provided. 1628 1629 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1630 prefix to be used for exploding or expanding information 1631 :type explode_infos_prefix: str 1632 :return: the value of the variable `explode_infos_prefix`. 1633 """ 1634 1635 if not explode_infos_prefix: 1636 explode_infos_prefix = ( 1637 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1638 ) 1639 1640 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a prefix to be used for exploding or expanding information.
Returns
The value of the variable `explode_infos_prefix`.
1642 def add_column( 1643 self, 1644 table_name, 1645 column_name, 1646 column_type, 1647 default_value=None, 1648 drop: bool = False, 1649 ) -> dict: 1650 """ 1651 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1652 doesn't already exist. 1653 1654 :param table_name: The name of the table to which you want to add a column 1655 :param column_name: The parameter "column_name" is the name of the column that you want to add 1656 to the table 1657 :param column_type: The `column_type` parameter specifies the data type of the column that you 1658 want to add to the table. It should be a string that represents the desired data type, such as 1659 "INTEGER", "TEXT", "REAL", etc 1660 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1661 default value for the newly added column. If a default value is provided, it will be assigned to 1662 the column for any existing rows that do not have a value for that column 1663 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1664 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1665 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1666 to False 1667 :type drop: bool (optional) 1668 :return: a boolean value indicating whether the column was successfully added to the table. 
1669 """ 1670 1671 # added 1672 added = False 1673 dropped = False 1674 1675 # Check if the column already exists in the table 1676 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1677 columns = self.get_query_to_df(query).columns.tolist() 1678 if column_name.upper() in [c.upper() for c in columns]: 1679 log.debug( 1680 f"The {column_name} column already exists in the {table_name} table" 1681 ) 1682 if drop: 1683 self.drop_column(table_name=table_name, column_name=column_name) 1684 dropped = True 1685 else: 1686 return None 1687 else: 1688 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1689 1690 # Add column in table 1691 add_column_query = ( 1692 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1693 ) 1694 if default_value is not None: 1695 add_column_query += f" DEFAULT {default_value}" 1696 self.execute_query(add_column_query) 1697 added = not dropped 1698 log.debug( 1699 f"The {column_name} column was successfully added to the {table_name} table" 1700 ) 1701 1702 if added: 1703 added_column = { 1704 "table_name": table_name, 1705 "column_name": column_name, 1706 "column_type": column_type, 1707 "default_value": default_value, 1708 } 1709 else: 1710 added_column = None 1711 1712 return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The `column_type` parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc.
- default_value: The `default_value` parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column.
- drop: The `drop` parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If `drop` is set to `True`, the function will drop the existing column before adding the new column. If `drop` is set to `False` (default), the existing column is left untouched. Defaults to False.
Returns
A dict describing the added column (table name, column name, type, default value), or None if the column was not newly added.
1714 def drop_column( 1715 self, column: dict = None, table_name: str = None, column_name: str = None 1716 ) -> bool: 1717 """ 1718 The `drop_column` function drops a specified column from a given table in a database and returns 1719 True if the column was successfully dropped, and False if the column does not exist in the 1720 table. 1721 1722 :param column: The `column` parameter is a dictionary that contains information about the column 1723 you want to drop. It has two keys: 1724 :type column: dict 1725 :param table_name: The `table_name` parameter is the name of the table from which you want to 1726 drop a column 1727 :type table_name: str 1728 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1729 from the table 1730 :type column_name: str 1731 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1732 and False if the column does not exist in the table. 1733 """ 1734 1735 # Find column infos 1736 if column: 1737 if isinstance(column, dict): 1738 table_name = column.get("table_name", None) 1739 column_name = column.get("column_name", None) 1740 elif isinstance(column, str): 1741 table_name = self.get_table_variants() 1742 column_name = column 1743 else: 1744 table_name = None 1745 column_name = None 1746 1747 if not table_name and not column_name: 1748 return False 1749 1750 # Removed 1751 removed = False 1752 1753 # Check if the column already exists in the table 1754 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1755 columns = self.get_query_to_df(query).columns.tolist() 1756 if column_name in columns: 1757 log.debug(f"The {column_name} column exists in the {table_name} table") 1758 else: 1759 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1760 return False 1761 1762 # Add column in table # ALTER TABLE integers DROP k 1763 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1764 
self.execute_query(add_column_query) 1765 removed = True 1766 log.debug( 1767 f"The {column_name} column was successfully dropped to the {table_name} table" 1768 ) 1769 1770 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The `column` parameter is a dictionary that contains information about the column you want to drop. It has two keys: "table_name" and "column_name". A plain column name string is also accepted.
- table_name: The `table_name` parameter is the name of the table from which you want to drop a column.
- column_name: The `column_name` parameter is the name of the column that you want to drop from the table.
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual columns of the variants table.

        For each selected INFO field, a column named "<prefix><field>" is
        added to the table (typed from the VCF header when possible, VARCHAR
        otherwise) and populated by parsing the INFO column with an UPDATE
        statement, chromosome by chromosome. Nothing is done in read-only
        ("RO") access mode.

        :param prefix: prefix for the exploded INFO columns; when not a
            string, falls back to `get_explode_infos_prefix()` or "INFO/"
        :type prefix: str
        :param create_index: whether to (re)create indexes once fields have
            been exploded, defaults to False
        :type create_index: bool (optional)
        :param fields: INFO fields to explode (patterns allowed); when empty,
            fields are resolved through `get_explode_infos_fields()`
        :type fields: list
        :param force: drop and re-create a column if it already exists,
            defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: update all exploded fields in a
            single UPDATE statement instead of one UPDATE per field, defaults
            to False
        :type proccess_all_fields_together: bool (optional)
        :param table: target table; defaults to the variants table
        :type table: str
        :return: the list of added columns (dicts as returned by `add_column`)
        """

        # Drop indexes before altering the table
        self.drop_indexes()

        # connexion format ("duckdb" or "sqlite": the INFO-parsing SQL differs)
        connexion_format = self.get_connexion_format()

        # Access mode ("RO" means read-only: no schema changes allowed)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: missing extra infos is not an error)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" expands to header fields)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column
                info_id_sql = prefix + info

                # Only explode fields known from the header, the request,
                # or the extra infos
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # SQL type from the header definition; VARCHAR fallback
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    # Multi-valued fields are stored as VARCHAR
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # NOTE: add_column returns None for a re-created column,
                    # so `force` also triggers the update below
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "<info>=<value>" from the
                        # INFO column ('' and '.' are normalized to NULL in DuckDB)
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (best-effort: fall back to a single pass
                # over the whole table when the query fails)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f"""
                            SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed with more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
The explode_infos function in Python takes a VCF file and explodes the INFO fields into
individual columns, returning a list of added columns.
Parameters
- prefix: The `prefix` parameter is a string used as a prefix for the exploded INFO fields. If it is not provided or is set to `None`, the value of `self.get_explode_infos_prefix()` is used instead.
- create_index: The `create_index` parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. Defaults to False.
- fields: The `fields` parameter is a list of INFO fields to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded.
- force: The `force` parameter is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. Defaults to False.
- proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean flag that determines whether to process all the INFO fields together or individually. Defaults to False.
- table: The `table` parameter is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If not provided, the variants table is used.
Returns
The
explode_infosfunction returns a list of added columns.
1989 def create_indexes(self) -> None: 1990 """ 1991 Create indexes on the table after insertion 1992 """ 1993 1994 # Access 1995 access = self.get_config().get("access", None) 1996 1997 # get table variants 1998 table_variants = self.get_table_variants("FROM") 1999 2000 if self.get_indexing() and access not in ["RO"]: 2001 # Create index 2002 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2003 self.conn.execute(sql_create_table_index) 2004 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2005 self.conn.execute(sql_create_table_index) 2006 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2007 self.conn.execute(sql_create_table_index) 2008 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2009 self.conn.execute(sql_create_table_index) 2010 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2011 self.conn.execute(sql_create_table_index) 2012 for field in self.index_additionnal_fields: 2013 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2014 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
2016 def drop_indexes(self) -> None: 2017 """ 2018 Create indexes on the table after insertion 2019 """ 2020 2021 # Access 2022 access = self.get_config().get("access", None) 2023 2024 # get table variants 2025 table_variants = self.get_table_variants("FROM") 2026 2027 # Get database format 2028 connexion_format = self.get_connexion_format() 2029 2030 if access not in ["RO"]: 2031 if connexion_format in ["duckdb"]: 2032 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2033 elif connexion_format in ["sqlite"]: 2034 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2035 2036 list_indexes = self.conn.execute(sql_list_indexes) 2037 index_names = [row[0] for row in list_indexes.fetchall()] 2038 for index in index_names: 2039 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2040 self.conn.execute(sql_drop_table_index)
Drop all indexes attached to the variants table.
2042 def read_vcf_header(self, f) -> list: 2043 """ 2044 It reads the header of a VCF file and returns a list of the header lines 2045 2046 :param f: the file object 2047 :return: The header lines of the VCF file. 2048 """ 2049 2050 header_list = [] 2051 for line in f: 2052 header_list.append(line) 2053 if line.startswith("#CHROM"): 2054 break 2055 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
2057 def read_vcf_header_file(self, file: str = None) -> list: 2058 """ 2059 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2060 uncompressed files. 2061 2062 :param file: The `file` parameter is a string that represents the path to the VCF header file 2063 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2064 default to `None` 2065 :type file: str 2066 :return: The function `read_vcf_header_file` returns a list. 2067 """ 2068 2069 if self.get_input_compressed(input_file=file): 2070 with bgzf.open(file, "rt") as f: 2071 return self.read_vcf_header(f=f) 2072 else: 2073 with open(file, "rt") as f: 2074 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: The `file` parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter; if you don't provide a value, it will default to `None`.
Returns
The function
read_vcf_header_filereturns a list.
2076 def execute_query(self, query: str): 2077 """ 2078 It takes a query as an argument, executes it, and returns the results 2079 2080 :param query: The query to be executed 2081 :return: The result of the query is being returned. 2082 """ 2083 if query: 2084 return self.conn.execute(query) # .fetchall() 2085 else: 2086 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
2088 def export_output( 2089 self, 2090 output_file: str | None = None, 2091 output_header: str | None = None, 2092 export_header: bool = True, 2093 query: str | None = None, 2094 parquet_partitions: list | None = None, 2095 chunk_size: int | None = None, 2096 threads: int | None = None, 2097 sort: bool = False, 2098 index: bool = False, 2099 order_by: str | None = None, 2100 fields_to_rename: dict | None = None, 2101 ) -> bool: 2102 """ 2103 The `export_output` function exports data from a VCF file to various formats, including VCF, 2104 CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and 2105 partitioning. 2106 2107 :param output_file: The `output_file` parameter is a string that specifies the name of the 2108 output file where the exported data will be saved 2109 :type output_file: str | None 2110 :param output_header: The `output_header` parameter is a string that specifies the name of the 2111 file where the header of the VCF file will be exported. If this parameter is not provided, the 2112 header will be exported to a file with the same name as the `output_file` parameter, but with 2113 the extension " 2114 :type output_header: str | None 2115 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2116 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2117 True, the header will be exported to a file. If `export_header` is False, the header will not 2118 be, defaults to True 2119 :type export_header: bool (optional) 2120 :param query: The `query` parameter in the `export_output` function is an optional SQL query 2121 that can be used to filter and select specific data from the VCF file before exporting it. If 2122 provided, only the data that matches the query will be exported. 
This allows you to customize 2123 the exported data based on 2124 :type query: str | None 2125 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2126 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2127 organize data in a hierarchical directory structure based on the values of one or more columns. 2128 This can improve query performance when working with large datasets 2129 :type parquet_partitions: list | None 2130 :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when 2131 exporting data in Parquet format. This parameter is used for partitioning the Parquet file into 2132 multiple files. It helps in optimizing the export process by breaking down the data into 2133 manageable chunks for processing and storage 2134 :type chunk_size: int | None 2135 :param threads: The `threads` parameter in the `export_output` function specifies the number of 2136 threads to be used during the export process. It determines the level of parallelism and can 2137 improve the performance of the export operation. If this parameter is not provided, the function 2138 will use the default number of threads 2139 :type threads: int | None 2140 :param sort: The `sort` parameter in the `export_output` function is a boolean flag that 2141 determines whether the output file should be sorted based on genomic coordinates of the 2142 variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to 2143 `False`,, defaults to False 2144 :type sort: bool (optional) 2145 :param index: The `index` parameter in the `export_output` function is a boolean flag that 2146 determines whether an index should be created on the output file. If `index` is set to `True`, 2147 an index will be created on the output file. 
If `index` is set to `False`, no, defaults to False 2148 :type index: bool (optional) 2149 :param order_by: The `order_by` parameter in the `export_output` function is a string that 2150 specifies the column(s) to use for sorting the output file. This parameter is only applicable 2151 when exporting data in VCF format. It allows you to specify the column(s) based on which the 2152 output file should be 2153 :type order_by: str | None 2154 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the 2155 mapping of field names to be renamed during the export process. This parameter allows you to 2156 customize the output field names before exporting the data. Each key-value pair in the 2157 dictionary represents the original field name as the key and the new field name 2158 :type fields_to_rename: dict | None 2159 :return: The `export_output` function returns a boolean value. It checks if the output file 2160 exists and returns True if it does, or None if it doesn't. 
2161 """ 2162 2163 # Log 2164 log.info("Exporting...") 2165 2166 # Full path 2167 output_file = full_path(output_file) 2168 output_header = full_path(output_header) 2169 2170 # Config 2171 config = self.get_config() 2172 2173 # Param 2174 param = self.get_param() 2175 2176 # Tmp files to remove 2177 tmp_to_remove = [] 2178 2179 # If no output, get it 2180 if not output_file: 2181 output_file = self.get_output() 2182 2183 # If not threads 2184 if not threads: 2185 threads = self.get_threads() 2186 2187 # Rename fields 2188 if not fields_to_rename: 2189 fields_to_rename = param.get("export", {}).get("fields_to_rename", None) 2190 self.rename_info_fields(fields_to_rename=fields_to_rename) 2191 2192 # Auto header name with extension 2193 if export_header or output_header: 2194 if not output_header: 2195 output_header = f"{output_file}.hdr" 2196 # Export header 2197 self.export_header(output_file=output_file) 2198 2199 # Switch off export header if VCF output 2200 output_file_type = get_file_format(output_file) 2201 if output_file_type in ["vcf"]: 2202 export_header = False 2203 tmp_to_remove.append(output_header) 2204 2205 # Chunk size 2206 if not chunk_size: 2207 chunk_size = config.get("chunk_size", None) 2208 2209 # Parquet partition 2210 if not parquet_partitions: 2211 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2212 if parquet_partitions and isinstance(parquet_partitions, str): 2213 parquet_partitions = parquet_partitions.split(",") 2214 2215 # Order by 2216 if not order_by: 2217 order_by = param.get("export", {}).get("order_by", "") 2218 2219 # Header in output 2220 header_in_output = param.get("export", {}).get("include_header", False) 2221 2222 # Database 2223 database_source = self.get_connexion() 2224 2225 # Connexion format 2226 connexion_format = self.get_connexion_format() 2227 2228 # Explode infos 2229 if self.get_explode_infos(): 2230 self.explode_infos( 2231 prefix=self.get_explode_infos_prefix(), 2232 
fields=self.get_explode_infos_fields(), 2233 force=False, 2234 ) 2235 2236 # if connexion_format in ["sqlite"] or query: 2237 if connexion_format in ["sqlite"]: 2238 2239 # Export in Parquet 2240 random_tmp = "".join( 2241 random.choice(string.ascii_lowercase) for i in range(10) 2242 ) 2243 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2244 tmp_to_remove.append(database_source) 2245 2246 # Table Variants 2247 table_variants = self.get_table_variants() 2248 2249 # Create export query 2250 sql_query_export_subquery = f""" 2251 SELECT * FROM {table_variants} 2252 """ 2253 2254 # Write source file 2255 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2256 2257 # Create database 2258 database = Database( 2259 database=database_source, 2260 table="variants", 2261 header_file=output_header, 2262 conn_config=self.get_connexion_config(), 2263 ) 2264 2265 # Existing colomns header 2266 existing_columns_header = database.get_header_columns_from_database(query=query) 2267 2268 # Sample list 2269 if output_file_type in ["vcf"]: 2270 get_samples = self.get_samples() 2271 get_samples_check = self.get_samples_check() 2272 samples_force = get_samples is not None 2273 sample_list = self.get_header_sample_list( 2274 check=get_samples_check, 2275 samples=get_samples, 2276 samples_force=samples_force, 2277 ) 2278 else: 2279 sample_list = None 2280 2281 # Export file 2282 database.export( 2283 output_database=output_file, 2284 output_header=output_header, 2285 existing_columns_header=existing_columns_header, 2286 parquet_partitions=parquet_partitions, 2287 chunk_size=chunk_size, 2288 threads=threads, 2289 sort=sort, 2290 index=index, 2291 header_in_output=header_in_output, 2292 order_by=order_by, 2293 query=query, 2294 export_header=export_header, 2295 sample_list=sample_list, 2296 ) 2297 2298 # Remove 2299 remove_if_exists(tmp_to_remove) 2300 2301 return (os.path.exists(output_file) or None) and ( 2302 os.path.exists(output_file) 
or None 2303 )
The export_output function exports data from a VCF file to various formats, including VCF,
CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
partitioning.
Parameters
- output_file: string specifying the name of the output file where the exported data will be saved.
- output_header: string specifying the name of the file where the header of the VCF file will be exported. If not provided, the header is exported to a file named after `output_file` with the extension ".hdr".
- export_header: boolean flag that determines whether the header of the VCF file is exported to a separate file. Defaults to True.
- query: optional SQL query used to filter and select specific data from the VCF file before exporting it; only matching data is exported.
- parquet_partitions: list of columns used to partition the Parquet file during export. Partitioning organizes data in a hierarchical directory structure based on column values and can improve query performance on large datasets.
- chunk_size: number of records per batch when exporting in Parquet format; used to split the export into manageable chunks.
- threads: number of threads used during the export; if not provided, the default number of threads is used.
- sort: boolean flag that determines whether the output file is sorted on the genomic coordinates of the variants. Defaults to False.
- index: boolean flag that determines whether an index is created on the output file. Defaults to False.
- order_by: column(s) used to sort the output file; only applicable when exporting in VCF format.
- fields_to_rename: dictionary mapping original field names (keys) to new field names (values), applied before export.
Returns
The `export_output` function returns a boolean value: True if the output file exists after export, or None if it doesn't.
2305 def get_extra_infos(self, table: str = None) -> list: 2306 """ 2307 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2308 in the header. 2309 2310 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2311 name of the table from which you want to retrieve the extra columns that are not present in the 2312 header. If the `table` parameter is not provided when calling the function, it will default to 2313 using the variants 2314 :type table: str 2315 :return: A list of columns that are in the specified table but not in the header of the table. 2316 """ 2317 2318 header_columns = [] 2319 2320 if not table: 2321 table = self.get_table_variants(clause="from") 2322 header_columns = self.get_header_columns() 2323 2324 # Check all columns in the database 2325 query = f""" SELECT * FROM {table} LIMIT 1 """ 2326 log.debug(f"query {query}") 2327 table_columns = self.get_query_to_df(query).columns.tolist() 2328 extra_columns = [] 2329 2330 # Construct extra infos (not in header) 2331 for column in table_columns: 2332 if column not in header_columns: 2333 extra_columns.append(column) 2334 2335 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: name of the table from which to retrieve the extra columns that are not present in the header. If not provided, it defaults to the variants table.
Returns
A list of columns that are in the specified table but not in the header of the table.
2337 def get_extra_infos_sql(self, table: str = None) -> str: 2338 """ 2339 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2340 by double quotes 2341 2342 :param table: The name of the table to get the extra infos from. If None, the default table is 2343 used 2344 :type table: str 2345 :return: A string of the extra infos 2346 """ 2347 2348 return ", ".join( 2349 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2350 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
2352 def export_header( 2353 self, 2354 header_name: str = None, 2355 output_file: str = None, 2356 output_file_ext: str = ".hdr", 2357 clean_header: bool = True, 2358 remove_chrom_line: bool = False, 2359 ) -> str: 2360 """ 2361 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2362 specified options, and writes it to a new file. 2363 2364 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2365 this parameter is not specified, the header will be written to the output file 2366 :type header_name: str 2367 :param output_file: The `output_file` parameter in the `export_header` function is used to 2368 specify the name of the output file where the header will be written. If this parameter is not 2369 provided, the header will be written to a temporary file 2370 :type output_file: str 2371 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2372 string that represents the extension of the output header file. By default, it is set to ".hdr" 2373 if not specified by the user. This extension will be appended to the `output_file` name to 2374 create the final, defaults to .hdr 2375 :type output_file_ext: str (optional) 2376 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2377 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2378 `True`, the function will clean the header by modifying certain lines based on a specific 2379 pattern. If `clean_header`, defaults to True 2380 :type clean_header: bool (optional) 2381 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2382 boolean flag that determines whether the #CHROM line should be removed from the header before 2383 writing it to the output file. 
If set to `True`, the #CHROM line will be removed; if set to `, 2384 defaults to False 2385 :type remove_chrom_line: bool (optional) 2386 :return: The function `export_header` returns the name of the temporary header file that is 2387 created. 2388 """ 2389 2390 if not header_name and not output_file: 2391 output_file = self.get_output() 2392 2393 if self.get_header(): 2394 2395 # Get header object 2396 header_obj = self.get_header() 2397 2398 # Create database 2399 db_for_header = Database(database=self.get_input()) 2400 2401 # Get real columns in the file 2402 db_header_columns = db_for_header.get_columns() 2403 2404 with tempfile.TemporaryDirectory() as tmpdir: 2405 2406 # Write header file 2407 header_file_tmp = os.path.join(tmpdir, "header") 2408 f = open(header_file_tmp, "w") 2409 vcf.Writer(f, header_obj) 2410 f.close() 2411 2412 # Replace #CHROM line with rel columns 2413 header_list = db_for_header.read_header_file( 2414 header_file=header_file_tmp 2415 ) 2416 header_list[-1] = "\t".join(db_header_columns) 2417 2418 # Remove CHROM line 2419 if remove_chrom_line: 2420 header_list.pop() 2421 2422 # Clean header 2423 if clean_header: 2424 header_list_clean = [] 2425 for head in header_list: 2426 # Clean head for malformed header 2427 head_clean = head 2428 head_clean = re.subn( 2429 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2430 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2431 head_clean, 2432 2, 2433 )[0] 2434 # Write header 2435 header_list_clean.append(head_clean) 2436 header_list = header_list_clean 2437 2438 tmp_header_name = output_file + output_file_ext 2439 2440 f = open(tmp_header_name, "w") 2441 for line in header_list: 2442 f.write(line) 2443 f.close() 2444 2445 return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: name of the header file to be created. If not specified, the header file name is derived from the output file.
- output_file: name of the output file the header is written for. If not provided, the configured output file is used.
- output_file_ext: extension of the output header file, ".hdr" by default; appended to the output file name to build the header file name.
- clean_header: boolean flag that determines whether the header is cleaned (malformed FORMAT lines of Type=Flag are rewritten as Type=String). Defaults to True.
- remove_chrom_line: boolean flag that determines whether the #CHROM line is removed from the header before writing it to the output file. Defaults to False.
Returns
The function
export_headerreturns the name of the temporary header file that is created.
2447 def export_variant_vcf( 2448 self, 2449 vcf_file, 2450 remove_info: bool = False, 2451 add_samples: bool = True, 2452 list_samples: list = [], 2453 where_clause: str = "", 2454 index: bool = False, 2455 threads: int | None = None, 2456 ) -> bool | None: 2457 """ 2458 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2459 remove INFO field, add samples, and control compression and indexing. 2460 2461 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2462 written to. It is the output file that will contain the filtered VCF data based on the specified 2463 parameters 2464 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2465 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2466 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2467 in, defaults to False 2468 :type remove_info: bool (optional) 2469 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2470 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2471 If set to False, the samples will be removed. The default value is True, defaults to True 2472 :type add_samples: bool (optional) 2473 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2474 in the output VCF file. By default, all samples will be included. If you provide a list of 2475 samples, only those samples will be included in the output file 2476 :type list_samples: list 2477 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2478 determines whether or not to create an index for the output VCF file. If `index` is set to 2479 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2480 :type index: bool (optional) 2481 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2482 number of threads to use for exporting the VCF file. It determines how many parallel threads 2483 will be used during the export process. More threads can potentially speed up the export process 2484 by utilizing multiple cores of the processor. If 2485 :type threads: int | None 2486 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2487 method with various parameters including the output file, query, threads, sort flag, and index 2488 flag. The `export_output` method is responsible for exporting the VCF data based on the 2489 specified parameters and configurations provided in the `export_variant_vcf` function. 2490 """ 2491 2492 # Config 2493 config = self.get_config() 2494 2495 # Extract VCF 2496 log.debug("Export VCF...") 2497 2498 # Table variants 2499 table_variants = self.get_table_variants() 2500 2501 # Threads 2502 if not threads: 2503 threads = self.get_threads() 2504 2505 # Info fields 2506 if remove_info: 2507 if not isinstance(remove_info, str): 2508 remove_info = "." 
2509 info_field = f"""'{remove_info}' as INFO""" 2510 else: 2511 info_field = "INFO" 2512 2513 # Samples fields 2514 if add_samples: 2515 if not list_samples: 2516 list_samples = self.get_header_sample_list() 2517 if list_samples: 2518 samples_fields = " , FORMAT , " + " , ".join( 2519 [f""" "{sample}" """ for sample in list_samples] 2520 ) 2521 else: 2522 samples_fields = "" 2523 log.debug(f"samples_fields: {samples_fields}") 2524 else: 2525 samples_fields = "" 2526 2527 # Where clause 2528 if where_clause is None: 2529 where_clause = "" 2530 2531 # Variants 2532 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2533 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2534 log.debug(f"sql_query_select={sql_query_select}") 2535 2536 return self.export_output( 2537 output_file=vcf_file, 2538 output_header=None, 2539 export_header=True, 2540 query=sql_query_select, 2541 parquet_partitions=None, 2542 chunk_size=config.get("chunk_size", None), 2543 threads=threads, 2544 sort=True, 2545 index=index, 2546 order_by=None, 2547 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: name of the file where the VCF data will be written; it contains the filtered VCF data based on the specified parameters.
- remove_info: boolean flag that determines whether the INFO field is replaced in the output VCF file (a string value is used as the replacement). Defaults to False.
- add_samples: boolean that determines whether the sample columns are included in the VCF file. Defaults to True.
- list_samples: list of samples to include in the output VCF file; by default, all samples are included.
- index: boolean flag that determines whether a tabix index is created for the output VCF file. Defaults to False.
- threads: number of threads used for exporting the VCF file; more threads can speed up the export by utilizing multiple processor cores.
Returns
The `export_variant_vcf` function returns the result of calling `export_output` with the output file, query, threads, sort flag, and index flag.
2549 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2550 """ 2551 It takes a list of commands and runs them in parallel using the number of threads specified 2552 2553 :param commands: A list of commands to run 2554 :param threads: The number of threads to use, defaults to 1 (optional) 2555 """ 2556 2557 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2559 def get_threads(self, default: int = 1) -> int: 2560 """ 2561 This function returns the number of threads to use for a job, with a default value of 1 if not 2562 specified. 2563 2564 :param default: The `default` parameter in the `get_threads` method is used to specify the 2565 default number of threads to use if no specific value is provided. If no value is provided for 2566 the `threads` parameter in the configuration or input parameters, the `default` value will be 2567 used, defaults to 1 2568 :type default: int (optional) 2569 :return: the number of threads to use for the current job. 2570 """ 2571 2572 # Config 2573 config = self.get_config() 2574 2575 # Param 2576 param = self.get_param() 2577 2578 # Input threads 2579 input_thread = param.get("threads", config.get("threads", None)) 2580 2581 # Check threads 2582 if not input_thread: 2583 threads = default 2584 elif int(input_thread) <= 0: 2585 threads = os.cpu_count() 2586 else: 2587 threads = int(input_thread) 2588 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: default number of threads to use when no value is provided in the configuration or input parameters. Defaults to 1.
Returns
the number of threads to use for the current job.
2590 def get_memory(self, default: str = None) -> str: 2591 """ 2592 This function retrieves the memory value from parameters or configuration with a default value 2593 if not found. 2594 2595 :param default: The `get_memory` function takes in a default value as a string parameter. This 2596 default value is used as a fallback in case the `memory` parameter is not provided in the 2597 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2598 the function 2599 :type default: str 2600 :return: The `get_memory` function returns a string value representing the memory parameter. If 2601 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2602 return the default value provided as an argument to the function. 2603 """ 2604 2605 # Config 2606 config = self.get_config() 2607 2608 # Param 2609 param = self.get_param() 2610 2611 # Input threads 2612 input_memory = param.get("memory", config.get("memory", None)) 2613 2614 # Check threads 2615 if input_memory: 2616 memory = input_memory 2617 else: 2618 memory = default 2619 2620 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: fallback value returned when "memory" is present in neither the param dictionary nor the config dictionary.
Returns
The configured memory value if provided, otherwise the default value passed as argument.
2622 def update_from_vcf(self, vcf_file: str) -> None: 2623 """ 2624 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2625 2626 :param vcf_file: the path to the VCF file 2627 """ 2628 2629 connexion_format = self.get_connexion_format() 2630 2631 if connexion_format in ["duckdb"]: 2632 self.update_from_vcf_duckdb(vcf_file) 2633 elif connexion_format in ["sqlite"]: 2634 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
2636 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2637 """ 2638 It takes a VCF file and updates the INFO column of the variants table in the database with the 2639 INFO column of the VCF file 2640 2641 :param vcf_file: the path to the VCF file 2642 """ 2643 2644 # varaints table 2645 table_variants = self.get_table_variants() 2646 2647 # Loading VCF into temporaire table 2648 skip = self.get_header_length(file=vcf_file) 2649 vcf_df = pd.read_csv( 2650 vcf_file, 2651 sep="\t", 2652 engine="c", 2653 skiprows=skip, 2654 header=0, 2655 low_memory=False, 2656 ) 2657 sql_query_update = f""" 2658 UPDATE {table_variants} as table_variants 2659 SET INFO = concat( 2660 CASE 2661 WHEN INFO NOT IN ('', '.') 2662 THEN INFO 2663 ELSE '' 2664 END, 2665 ( 2666 SELECT 2667 concat( 2668 CASE 2669 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2670 THEN ';' 2671 ELSE '' 2672 END 2673 , 2674 CASE 2675 WHEN table_parquet.INFO NOT IN ('','.') 2676 THEN table_parquet.INFO 2677 ELSE '' 2678 END 2679 ) 2680 FROM vcf_df as table_parquet 2681 WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2682 AND table_parquet.\"POS\" = table_variants.\"POS\" 2683 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2684 AND table_parquet.\"REF\" = table_variants.\"REF\" 2685 AND table_parquet.INFO NOT IN ('','.') 2686 ) 2687 ) 2688 ; 2689 """ 2690 self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
2692 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2693 """ 2694 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2695 table, then updates the INFO column of the variants table with the INFO column of the temporary 2696 table 2697 2698 :param vcf_file: The path to the VCF file you want to update the database with 2699 """ 2700 2701 # Create a temporary table for the VCF 2702 table_vcf = "tmp_vcf" 2703 sql_create = ( 2704 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2705 ) 2706 self.conn.execute(sql_create) 2707 2708 # Loading VCF into temporaire table 2709 vcf_df = pd.read_csv( 2710 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2711 ) 2712 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2713 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2714 2715 # Update table 'variants' with VCF data 2716 # warning: CONCAT as || operator 2717 sql_query_update = f""" 2718 UPDATE variants as table_variants 2719 SET INFO = CASE 2720 WHEN INFO NOT IN ('', '.') 2721 THEN INFO 2722 ELSE '' 2723 END || 2724 ( 2725 SELECT 2726 CASE 2727 WHEN table_variants.INFO NOT IN ('','.') 2728 AND table_vcf.INFO NOT IN ('','.') 2729 THEN ';' 2730 ELSE '' 2731 END || 2732 CASE 2733 WHEN table_vcf.INFO NOT IN ('','.') 2734 THEN table_vcf.INFO 2735 ELSE '' 2736 END 2737 FROM {table_vcf} as table_vcf 2738 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2739 AND table_vcf.\"POS\" = table_variants.\"POS\" 2740 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2741 AND table_vcf.\"REF\" = table_variants.\"REF\" 2742 ) 2743 """ 2744 self.conn.execute(sql_query_update) 2745 2746 # Drop temporary table 2747 sql_drop = f"DROP TABLE {table_vcf}" 2748 self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2750 def drop_variants_table(self) -> None: 2751 """ 2752 > This function drops the variants table 2753 """ 2754 2755 table_variants = self.get_table_variants() 2756 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2757 self.conn.execute(sql_table_variants)
This function drops the variants table
2759 def set_variant_id( 2760 self, variant_id_column: str = "variant_id", force: bool = None 2761 ) -> str: 2762 """ 2763 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2764 `#CHROM`, `POS`, `REF`, and `ALT` columns 2765 2766 :param variant_id_column: The name of the column to be created in the variants table, defaults 2767 to variant_id 2768 :type variant_id_column: str (optional) 2769 :param force: If True, the variant_id column will be created even if it already exists 2770 :type force: bool 2771 :return: The name of the column that contains the variant_id 2772 """ 2773 2774 # Assembly 2775 assembly = self.get_param().get( 2776 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2777 ) 2778 2779 # INFO/Tag prefix 2780 prefix = self.get_explode_infos_prefix() 2781 2782 # Explode INFO/SVTYPE 2783 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2784 2785 # variants table 2786 table_variants = self.get_table_variants() 2787 2788 # variant_id column 2789 if not variant_id_column: 2790 variant_id_column = "variant_id" 2791 2792 # Creta variant_id column 2793 if "variant_id" not in self.get_extra_infos() or force: 2794 2795 # Create column 2796 self.add_column( 2797 table_name=table_variants, 2798 column_name=variant_id_column, 2799 column_type="UBIGINT", 2800 default_value="0", 2801 ) 2802 2803 # Update column 2804 self.conn.execute( 2805 f""" 2806 UPDATE {table_variants} 2807 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"') 2808 """ 2809 ) 2810 2811 # Remove added columns 2812 for added_column in added_columns: 2813 self.drop_column(column=added_column) 2814 2815 # return variant_id column name 2816 return variant_id_column
It adds a column to the variants table called `variant_id` and populates it with a hash of the `#CHROM`, `POS`, `REF`, and `ALT` columns.
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2818 def get_variant_id_column( 2819 self, variant_id_column: str = "variant_id", force: bool = None 2820 ) -> str: 2821 """ 2822 This function returns the variant_id column name 2823 2824 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2825 defaults to variant_id 2826 :type variant_id_column: str (optional) 2827 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2828 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2829 if it is not already set, or if it is set 2830 :type force: bool 2831 :return: The variant_id column name. 2832 """ 2833 2834 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2840 def scan_databases( 2841 self, 2842 database_formats: list = ["parquet"], 2843 database_releases: list = ["current"], 2844 ) -> dict: 2845 """ 2846 The function `scan_databases` scans for available databases based on specified formats and 2847 releases. 2848 2849 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2850 of the databases to be scanned. In this case, the accepted format is "parquet" 2851 :type database_formats: list ["parquet"] 2852 :param database_releases: The `database_releases` parameter is a list that specifies the 2853 releases of the databases to be scanned. In the provided function, the default value for 2854 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2855 databases that are in the "current" 2856 :type database_releases: list 2857 :return: The function `scan_databases` returns a dictionary containing information about 2858 databases that match the specified formats and releases. 2859 """ 2860 2861 # Config 2862 config = self.get_config() 2863 2864 # Param 2865 param = self.get_param() 2866 2867 # Param - Assembly 2868 assembly = param.get("assembly", config.get("assembly", None)) 2869 if not assembly: 2870 assembly = DEFAULT_ASSEMBLY 2871 log.warning(f"Default assembly '{assembly}'") 2872 2873 # Scan for availabled databases 2874 log.info( 2875 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2876 ) 2877 databases_infos_dict = databases_infos( 2878 database_folder_releases=database_releases, 2879 database_formats=database_formats, 2880 assembly=assembly, 2881 config=config, 2882 ) 2883 log.info( 2884 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2885 ) 2886 2887 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The `database_formats` parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet".
- database_releases: The `database_releases` parameter is a list that specifies the releases of the databases to be scanned. The default value for `database_releases` is `["current"]`, meaning that by default, the function will scan databases that are in the "current" release.
Returns
The function `scan_databases` returns a dictionary containing information about databases that match the specified formats and releases.
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        Consolidates quick-annotation strings and per-tool parameters
        (annotation_parquet, annotation_snpsift, annotation_snpeff,
        annotation_bcftools, annotation_annovar, annotation_exomiser,
        annotation_splice) into the 'annotations' parameter, resolves each
        database file (expanding 'ALL[:format=...][:release=...]'), dispatches
        each entry to the matching tool section of the 'annotation' parameter,
        then runs every configured annotation tool in turn.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config; fall back to default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (deduplicated union of the generic
        # annotations folder and the parquet/bcftools specific folders)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated quick-annotation string)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold per-tool shortcut parameters into the same
        # quick-annotation list, with a "tool:" prefix where applicable.
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            # "+"-join multiple snpsift databases under a single "snpsift:" prefix
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list back into the consolidated parameter
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: normalize the string form into a
            # dict of {annotation_file: {"INFO": None}} (INFO = all fields)
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f" {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL ("ALL" or "ALL:option=value:...")
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options: each ":"-separated token is an annovar database code
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS (file-based databases)
                    else:

                        # Tools detection: optional explicit "tool:" prefix
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("bigwig:"):
                            annotation_tool_initial = "bigwig"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ("+" and ":" both act as separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file: as given, as full path, then within
                                # the configured database folders
                                annotation_file_found = None

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file
                                elif os.path.exists(full_path(annotation_file)):
                                    annotation_file_found = full_path(annotation_file)
                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # NOTE(review): bcftools_preference is always
                                    # False here, so the bcftools branch below is
                                    # effectively disabled — confirm intent
                                    bcftools_preference = False

                                    # Check Annotation Tool: infer from format
                                    # when no explicit tool prefix was given
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        elif quick_annotation_format in ["bw"]:
                                            annotation_tool = "bigwig"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the
                                    # resolved file under its tool section
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.warning(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        # Persist the rewritten parameters before running the tools
        self.set_param(param)

        # Run each configured annotation tool
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It annotates the VCF file with the annotations specified in the config file.
    def annotation_bigwig(self, threads: int = None) -> None:
        """
        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.

        Exports the variants to a temporary VCF, looks up each variant position
        in every configured BigWig database via pyBigWig, writes the values as
        INFO fields with cyvcf2, then updates the variants table from the
        annotated VCF.

        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
        number of threads to be used for parallel processing during the annotation process. Currently
        unused (the thread-resolution code is commented out below)
        :type threads: int
        :return: True (returns None early when the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with bigwig databases")

        # # Threads
        # if not threads:
        #     threads = self.get_threads()
        # log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - BigWig databases folders (generic annotations + bigwig-specific)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bigwig", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: {database: {field: new_name, ...}} mapping for bigwig annotation
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bigwig", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (logged for debugging only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # annotation_bigwig_config: one entry per valid bigwig database
                annotation_bigwig_config_list = []

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    # Default: annotate with all INFO fields
                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()

                    # If db_file is http ?
                    if database.get_database().startswith("http"):

                        # Database is HTTP URL
                        db_file_is_http = True

                        # DB file keep as URL
                        db_file = database.get_database()
                        log.warning(
                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
                        )

                        # Retrieve automatic annotation field name from the URL basename
                        annotation_field = clean_annotation_field(
                            os.path.basename(db_file).replace(".bw", "")
                        )
                        log.debug(
                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
                        )

                        # Create automatic header file describing the single field
                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                        with open(db_hdr_file, "w") as f:
                            f.write("##fileformat=VCFv4.2\n")
                            f.write(
                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
                            )
                            f.write(f"#CHROM START END {annotation_field}\n")

                    else:

                        # Database is NOT HTTP URL
                        db_file_is_http = False

                    # Check index - try to create if not exists
                    if (
                        db_file is None
                        or db_hdr_file is None
                        or (not os.path.exists(db_file) and not db_file_is_http)
                        or not os.path.exists(db_hdr_file)
                        or not db_file_type in ["bw"]
                    ):
                        # if False:
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation file type: {db_file_type}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                        )
                    else:

                        # Log
                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Init
                        cyvcf2_header_rename_dict = {}
                        cyvcf2_header_list = []
                        cyvcf2_header_indexes = {}

                        # process annotation fields
                        for annotation_field in annotation_fields:

                            # New annotation name
                            annotation_field_new = annotation_fields[annotation_field]

                            # Check annotation field and index in header
                            # (-3 skips the #CHROM START END leading columns)
                            if (
                                annotation_field
                                in db_hdr_vcf.get_header_columns_as_list()
                            ):
                                annotation_field_index = (
                                    db_hdr_vcf.get_header_columns_as_list().index(
                                        annotation_field
                                    )
                                    - 3
                                )
                                cyvcf2_header_indexes[annotation_field_new] = (
                                    annotation_field_index
                                )
                            else:
                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                            # Append annotation field in cyvcf2 header list
                            cyvcf2_header_rename_dict[annotation_field_new] = (
                                db_hdr_vcf_header_infos[annotation_field].id
                            )
                            cyvcf2_header_list.append(
                                {
                                    "ID": annotation_field_new,
                                    "Number": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].num,
                                    "Type": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].type,
                                    "Description": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].desc,
                                }
                            )

                            # Add header on VCF
                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
                                annotation_field_new,
                                db_hdr_vcf_header_infos[annotation_field].num,
                                db_hdr_vcf_header_infos[annotation_field].type,
                                db_hdr_vcf_header_infos[annotation_field].desc,
                                "HOWARD BigWig annotation",
                                "unknown",
                                self.code_type_map[
                                    db_hdr_vcf_header_infos[annotation_field].type
                                ],
                            )

                        # Load bigwig database and validate its format
                        bw_db = pyBigWig.open(db_file)
                        if bw_db.isBigWig():
                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
                        else:
                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        annotation_bigwig_config_list.append(
                            {
                                "db_file": db_file,
                                "bw_db": bw_db,
                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                                "cyvcf2_header_list": cyvcf2_header_list,
                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
                            }
                        )

                # Annotate
                if annotation_bigwig_config_list:

                    # Annotation config
                    log.debug(
                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
                    )

                    # Export VCF file (INFO stripped, no samples, indexed)
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Load input tmp file
                    input_vcf = cyvcf2.VCF(tmp_vcf_name)

                    # Add header in input file
                    for annotation_bigwig_config in annotation_bigwig_config_list:
                        for cyvcf2_header_field in annotation_bigwig_config.get(
                            "cyvcf2_header_list", []
                        ):
                            log.info(
                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
                            )
                            input_vcf.add_info_to_header(cyvcf2_header_field)

                    # Create output VCF file
                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                    # Fetch variants
                    log.info(f"Annotations 'bigwig' start...")

                    for variant in input_vcf:

                        for annotation_bigwig_config in annotation_bigwig_config_list:

                            # DB and indexes
                            bw_db = annotation_bigwig_config.get("bw_db", None)
                            cyvcf2_header_indexes = annotation_bigwig_config.get(
                                "cyvcf2_header_indexes", None
                            )

                            # Retrieve value from chrom pos (BigWig intervals
                            # are 0-based half-open; VCF POS is 1-based)
                            res = bw_db.values(
                                variant.CHROM, variant.POS - 1, variant.POS
                            )

                            # For each annotation fields (and indexes)
                            for cyvcf2_header_index in cyvcf2_header_indexes:

                                # If value is NOT NaN, write it as INFO
                                if not np.isnan(
                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
                                ):
                                    variant.INFO[cyvcf2_header_index] = res[
                                        cyvcf2_header_indexes[cyvcf2_header_index]
                                    ]

                        # Add record in output file
                        output_vcf.write_record(variant)

                    # Log
                    log.debug(f"Annotation done.")

                    # Close and write file
                    log.info(f"Annotations 'bigwig' write...")
                    output_vcf.close()
                    log.debug(f"Write done.")

                    # Update variants
                    log.info(f"Annotations 'bigwig' update...")
                    self.update_from_vcf(output_vcf_file)
                    log.debug(f"Update done.")

        return True
The function annotation_bigwig annotates variants in a VCF file using bigwig databases.
Parameters
- threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the number of threads to be used for parallel processing during the annotation process. If the `threads` parameter is not provided, the method will attempt to determine the optimal number of threads to use based on the system configuration.
Returns
True
3621 def annotation_snpsift(self, threads: int = None) -> None: 3622 """ 3623 This function annotate with bcftools 3624 3625 :param threads: Number of threads to use 3626 :return: the value of the variable "return_value". 3627 """ 3628 3629 # DEBUG 3630 log.debug("Start annotation with bcftools databases") 3631 3632 # Threads 3633 if not threads: 3634 threads = self.get_threads() 3635 log.debug("Threads: " + str(threads)) 3636 3637 # Config 3638 config = self.get_config() 3639 log.debug("Config: " + str(config)) 3640 3641 # Config - snpSift 3642 snpsift_bin_command = get_bin_command( 3643 bin="SnpSift.jar", 3644 tool="snpsift", 3645 bin_type="jar", 3646 config=config, 3647 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3648 ) 3649 if not snpsift_bin_command: 3650 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3651 log.error(msg_err) 3652 raise ValueError(msg_err) 3653 3654 # Config - bcftools 3655 bcftools_bin_command = get_bin_command( 3656 bin="bcftools", 3657 tool="bcftools", 3658 bin_type="bin", 3659 config=config, 3660 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3661 ) 3662 if not bcftools_bin_command: 3663 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3664 log.error(msg_err) 3665 raise ValueError(msg_err) 3666 3667 # Config - BCFTools databases folders 3668 databases_folders = set( 3669 self.get_config() 3670 .get("folders", {}) 3671 .get("databases", {}) 3672 .get("annotations", ["."]) 3673 + self.get_config() 3674 .get("folders", {}) 3675 .get("databases", {}) 3676 .get("bcftools", ["."]) 3677 ) 3678 log.debug("Databases annotations: " + str(databases_folders)) 3679 3680 # Param 3681 annotations = ( 3682 self.get_param() 3683 .get("annotation", {}) 3684 .get("snpsift", {}) 3685 .get("annotations", None) 3686 ) 3687 log.debug("Annotations: " + str(annotations)) 3688 3689 # Assembly 3690 assembly = self.get_param().get( 3691 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3692 ) 3693 
3694 # Data 3695 table_variants = self.get_table_variants() 3696 3697 # Check if not empty 3698 log.debug("Check if not empty") 3699 sql_query_chromosomes = ( 3700 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3701 ) 3702 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3703 if not sql_query_chromosomes_df["count"][0]: 3704 log.info(f"VCF empty") 3705 return 3706 3707 # VCF header 3708 vcf_reader = self.get_header() 3709 log.debug("Initial header: " + str(vcf_reader.infos)) 3710 3711 # Existing annotations 3712 for vcf_annotation in self.get_header().infos: 3713 3714 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3715 log.debug( 3716 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3717 ) 3718 3719 if annotations: 3720 3721 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3722 3723 # Export VCF file 3724 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3725 3726 # Init 3727 commands = {} 3728 3729 for annotation in annotations: 3730 annotation_fields = annotations[annotation] 3731 3732 # Annotation Name 3733 annotation_name = os.path.basename(annotation) 3734 3735 if not annotation_fields: 3736 annotation_fields = {"INFO": None} 3737 3738 log.debug(f"Annotation '{annotation_name}'") 3739 log.debug( 3740 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3741 ) 3742 3743 # Create Database 3744 database = Database( 3745 database=annotation, 3746 databases_folders=databases_folders, 3747 assembly=assembly, 3748 ) 3749 3750 # Find files 3751 db_file = database.get_database() 3752 db_file = full_path(db_file) 3753 db_hdr_file = database.get_header_file() 3754 db_hdr_file = full_path(db_hdr_file) 3755 db_file_type = database.get_format() 3756 db_tbi_file = f"{db_file}.tbi" 3757 db_file_compressed = database.is_compressed() 3758 3759 # Check if compressed 3760 if not db_file_compressed: 3761 log.error( 3762 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3763 ) 3764 raise ValueError( 3765 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3766 ) 3767 3768 # Check if indexed 3769 if not os.path.exists(db_tbi_file): 3770 log.error( 3771 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3772 ) 3773 raise ValueError( 3774 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3775 ) 3776 3777 # Check index - try to create if not exists 3778 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3779 log.error("Annotation failed: database not valid") 3780 log.error(f"Annotation annotation file: {db_file}") 3781 log.error(f"Annotation annotation header: {db_hdr_file}") 3782 log.error(f"Annotation annotation index: {db_tbi_file}") 3783 raise ValueError( 3784 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3785 ) 3786 else: 3787 3788 log.debug( 3789 f"Annotation '{annotation}' - file: " 3790 + str(db_file) 3791 + " and " 3792 + str(db_hdr_file) 3793 ) 3794 3795 # Load header as VCF object 3796 db_hdr_vcf = Variants(input=db_hdr_file) 3797 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3798 log.debug( 3799 "Annotation database header: " 3800 + str(db_hdr_vcf_header_infos) 3801 ) 3802 3803 # For all fields in database 3804 annotation_fields_full = False 3805 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3806 annotation_fields = { 3807 key: key for key in db_hdr_vcf_header_infos 3808 } 3809 log.debug( 3810 "Annotation database header - All annotations added: " 3811 + str(annotation_fields) 3812 ) 3813 annotation_fields_full = True 3814 3815 # # Create file for field rename 3816 # log.debug("Create file for field rename") 3817 # tmp_rename = NamedTemporaryFile( 3818 # prefix=self.get_prefix(), 3819 # dir=self.get_tmp_dir(), 3820 # suffix=".rename", 3821 # delete=False, 3822 # ) 3823 # tmp_rename_name = tmp_rename.name 
3824 # tmp_files.append(tmp_rename_name) 3825 3826 # Number of fields 3827 nb_annotation_field = 0 3828 annotation_list = [] 3829 annotation_infos_rename_list = [] 3830 3831 for annotation_field in annotation_fields: 3832 3833 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3834 annotation_fields_new_name = annotation_fields.get( 3835 annotation_field, annotation_field 3836 ) 3837 if not annotation_fields_new_name: 3838 annotation_fields_new_name = annotation_field 3839 3840 # Check if field is in DB and if field is not elready in input data 3841 if ( 3842 annotation_field in db_hdr_vcf.get_header().infos 3843 and annotation_fields_new_name 3844 not in self.get_header().infos 3845 ): 3846 3847 log.info( 3848 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3849 ) 3850 3851 # BCFTools annotate param to rename fields 3852 if annotation_field != annotation_fields_new_name: 3853 annotation_infos_rename_list.append( 3854 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3855 ) 3856 3857 # Add INFO field to header 3858 db_hdr_vcf_header_infos_number = ( 3859 db_hdr_vcf_header_infos[annotation_field].num or "." 
3860 ) 3861 db_hdr_vcf_header_infos_type = ( 3862 db_hdr_vcf_header_infos[annotation_field].type 3863 or "String" 3864 ) 3865 db_hdr_vcf_header_infos_description = ( 3866 db_hdr_vcf_header_infos[annotation_field].desc 3867 or f"{annotation_field} description" 3868 ) 3869 db_hdr_vcf_header_infos_source = ( 3870 db_hdr_vcf_header_infos[annotation_field].source 3871 or "unknown" 3872 ) 3873 db_hdr_vcf_header_infos_version = ( 3874 db_hdr_vcf_header_infos[annotation_field].version 3875 or "unknown" 3876 ) 3877 3878 vcf_reader.infos[annotation_fields_new_name] = ( 3879 vcf.parser._Info( 3880 annotation_fields_new_name, 3881 db_hdr_vcf_header_infos_number, 3882 db_hdr_vcf_header_infos_type, 3883 db_hdr_vcf_header_infos_description, 3884 db_hdr_vcf_header_infos_source, 3885 db_hdr_vcf_header_infos_version, 3886 self.code_type_map[ 3887 db_hdr_vcf_header_infos_type 3888 ], 3889 ) 3890 ) 3891 3892 annotation_list.append(annotation_field) 3893 3894 nb_annotation_field += 1 3895 3896 else: 3897 3898 if ( 3899 annotation_field 3900 not in db_hdr_vcf.get_header().infos 3901 ): 3902 log.warning( 3903 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3904 ) 3905 if ( 3906 annotation_fields_new_name 3907 in self.get_header().infos 3908 ): 3909 log.warning( 3910 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3911 ) 3912 3913 log.info( 3914 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3915 ) 3916 3917 annotation_infos = ",".join(annotation_list) 3918 3919 if annotation_infos != "": 3920 3921 # Annotated VCF (and error file) 3922 tmp_annotation_vcf_name = os.path.join( 3923 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3924 ) 3925 tmp_annotation_vcf_name_err = ( 3926 tmp_annotation_vcf_name + ".err" 3927 ) 3928 3929 # Add fields to annotate 3930 if not annotation_fields_full: 3931 annotation_infos_option = f"-info {annotation_infos}" 3932 else: 
3933 annotation_infos_option = "" 3934 3935 # Info fields rename 3936 if annotation_infos_rename_list: 3937 annotation_infos_rename = " -c " + ",".join( 3938 annotation_infos_rename_list 3939 ) 3940 else: 3941 annotation_infos_rename = "" 3942 3943 # Annotate command 3944 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3945 3946 # Add command 3947 commands[command_annotate] = tmp_annotation_vcf_name 3948 3949 if commands: 3950 3951 # Export VCF file 3952 self.export_variant_vcf( 3953 vcf_file=tmp_vcf_name, 3954 remove_info=True, 3955 add_samples=False, 3956 index=True, 3957 ) 3958 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3959 3960 # Num command 3961 nb_command = 0 3962 3963 # Annotate 3964 for command_annotate in commands: 3965 nb_command += 1 3966 log.info( 3967 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3968 ) 3969 log.debug(f"command_annotate={command_annotate}") 3970 run_parallel_commands([command_annotate], threads) 3971 3972 # Debug 3973 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3974 3975 # Update variants 3976 log.info( 3977 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3978 ) 3979 self.update_from_vcf(commands[command_annotate])
This function annotates variants with bcftools
Parameters
- threads: Number of threads to use
Returns
None: the variants table is updated in place.
3981 def annotation_bcftools(self, threads: int = None) -> None: 3982 """ 3983 This function annotate with bcftools 3984 3985 :param threads: Number of threads to use 3986 :return: the value of the variable "return_value". 3987 """ 3988 3989 # DEBUG 3990 log.debug("Start annotation with bcftools databases") 3991 3992 # Threads 3993 if not threads: 3994 threads = self.get_threads() 3995 log.debug("Threads: " + str(threads)) 3996 3997 # Config 3998 config = self.get_config() 3999 log.debug("Config: " + str(config)) 4000 4001 # DEBUG 4002 delete_tmp = True 4003 if self.get_config().get("verbosity", "warning") in ["debug"]: 4004 delete_tmp = False 4005 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4006 4007 # Config - BCFTools bin command 4008 bcftools_bin_command = get_bin_command( 4009 bin="bcftools", 4010 tool="bcftools", 4011 bin_type="bin", 4012 config=config, 4013 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 4014 ) 4015 if not bcftools_bin_command: 4016 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 4017 log.error(msg_err) 4018 raise ValueError(msg_err) 4019 4020 # Config - BCFTools databases folders 4021 databases_folders = set( 4022 self.get_config() 4023 .get("folders", {}) 4024 .get("databases", {}) 4025 .get("annotations", ["."]) 4026 + self.get_config() 4027 .get("folders", {}) 4028 .get("databases", {}) 4029 .get("bcftools", ["."]) 4030 ) 4031 log.debug("Databases annotations: " + str(databases_folders)) 4032 4033 # Param 4034 annotations = ( 4035 self.get_param() 4036 .get("annotation", {}) 4037 .get("bcftools", {}) 4038 .get("annotations", None) 4039 ) 4040 log.debug("Annotations: " + str(annotations)) 4041 4042 # Assembly 4043 assembly = self.get_param().get( 4044 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 4045 ) 4046 4047 # Data 4048 table_variants = self.get_table_variants() 4049 4050 # Check if not empty 4051 log.debug("Check if not empty") 4052 sql_query_chromosomes = ( 4053 f"""SELECT 
count(*) as count FROM {table_variants} as table_variants""" 4054 ) 4055 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 4056 if not sql_query_chromosomes_df["count"][0]: 4057 log.info(f"VCF empty") 4058 return 4059 4060 # Export in VCF 4061 log.debug("Create initial file to annotate") 4062 tmp_vcf = NamedTemporaryFile( 4063 prefix=self.get_prefix(), 4064 dir=self.get_tmp_dir(), 4065 suffix=".vcf.gz", 4066 delete=False, 4067 ) 4068 tmp_vcf_name = tmp_vcf.name 4069 4070 # VCF header 4071 vcf_reader = self.get_header() 4072 log.debug("Initial header: " + str(vcf_reader.infos)) 4073 4074 # Existing annotations 4075 for vcf_annotation in self.get_header().infos: 4076 4077 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4078 log.debug( 4079 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4080 ) 4081 4082 if annotations: 4083 4084 tmp_ann_vcf_list = [] 4085 commands = [] 4086 tmp_files = [] 4087 err_files = [] 4088 4089 for annotation in annotations: 4090 annotation_fields = annotations[annotation] 4091 4092 # Annotation Name 4093 annotation_name = os.path.basename(annotation) 4094 4095 if not annotation_fields: 4096 annotation_fields = {"INFO": None} 4097 4098 log.debug(f"Annotation '{annotation_name}'") 4099 log.debug( 4100 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 4101 ) 4102 4103 # Create Database 4104 database = Database( 4105 database=annotation, 4106 databases_folders=databases_folders, 4107 assembly=assembly, 4108 ) 4109 4110 # Find files 4111 db_file = database.get_database() 4112 db_file = full_path(db_file) 4113 db_hdr_file = database.get_header_file() 4114 db_hdr_file = full_path(db_hdr_file) 4115 db_file_type = database.get_format() 4116 db_tbi_file = f"{db_file}.tbi" 4117 db_file_compressed = database.is_compressed() 4118 4119 # Check if compressed 4120 if not db_file_compressed: 4121 log.error( 4122 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4123 ) 
4124 raise ValueError( 4125 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4126 ) 4127 4128 # Check if indexed 4129 if not os.path.exists(db_tbi_file): 4130 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 4131 raise ValueError( 4132 f"Annotation '{annotation}' - {db_file} NOT indexed file" 4133 ) 4134 4135 # Check index - try to create if not exists 4136 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 4137 log.error("Annotation failed: database not valid") 4138 log.error(f"Annotation annotation file: {db_file}") 4139 log.error(f"Annotation annotation header: {db_hdr_file}") 4140 log.error(f"Annotation annotation index: {db_tbi_file}") 4141 raise ValueError( 4142 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 4143 ) 4144 else: 4145 4146 log.debug( 4147 f"Annotation '{annotation}' - file: " 4148 + str(db_file) 4149 + " and " 4150 + str(db_hdr_file) 4151 ) 4152 4153 # Load header as VCF object 4154 db_hdr_vcf = Variants(input=db_hdr_file) 4155 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 4156 log.debug( 4157 "Annotation database header: " + str(db_hdr_vcf_header_infos) 4158 ) 4159 4160 # For all fields in database 4161 if "ALL" in annotation_fields or "INFO" in annotation_fields: 4162 annotation_fields = { 4163 key: key for key in db_hdr_vcf_header_infos 4164 } 4165 log.debug( 4166 "Annotation database header - All annotations added: " 4167 + str(annotation_fields) 4168 ) 4169 4170 # Number of fields 4171 nb_annotation_field = 0 4172 annotation_list = [] 4173 4174 for annotation_field in annotation_fields: 4175 4176 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 4177 annotation_fields_new_name = annotation_fields.get( 4178 annotation_field, annotation_field 4179 ) 4180 if not annotation_fields_new_name: 4181 annotation_fields_new_name = annotation_field 4182 4183 # Check if field is in DB and if field is not elready in input data 4184 if ( 4185 annotation_field in db_hdr_vcf.get_header().infos 4186 and annotation_fields_new_name 4187 not in self.get_header().infos 4188 ): 4189 4190 log.info( 4191 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 4192 ) 4193 4194 # Add INFO field to header 4195 db_hdr_vcf_header_infos_number = ( 4196 db_hdr_vcf_header_infos[annotation_field].num or "." 4197 ) 4198 db_hdr_vcf_header_infos_type = ( 4199 db_hdr_vcf_header_infos[annotation_field].type 4200 or "String" 4201 ) 4202 db_hdr_vcf_header_infos_description = ( 4203 db_hdr_vcf_header_infos[annotation_field].desc 4204 or f"{annotation_field} description" 4205 ) 4206 db_hdr_vcf_header_infos_source = ( 4207 db_hdr_vcf_header_infos[annotation_field].source 4208 or "unknown" 4209 ) 4210 db_hdr_vcf_header_infos_version = ( 4211 db_hdr_vcf_header_infos[annotation_field].version 4212 or "unknown" 4213 ) 4214 4215 vcf_reader.infos[annotation_fields_new_name] = ( 4216 vcf.parser._Info( 4217 annotation_fields_new_name, 4218 db_hdr_vcf_header_infos_number, 4219 db_hdr_vcf_header_infos_type, 4220 db_hdr_vcf_header_infos_description, 4221 db_hdr_vcf_header_infos_source, 4222 db_hdr_vcf_header_infos_version, 4223 self.code_type_map[db_hdr_vcf_header_infos_type], 4224 ) 4225 ) 4226 4227 # annotation_list.append(annotation_field) 4228 if annotation_field != annotation_fields_new_name: 4229 annotation_list.append( 4230 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 4231 ) 4232 else: 4233 annotation_list.append(annotation_field) 4234 4235 nb_annotation_field += 1 4236 4237 else: 4238 4239 if annotation_field not in db_hdr_vcf.get_header().infos: 4240 log.warning( 4241 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 4242 ) 4243 if annotation_fields_new_name in self.get_header().infos: 4244 log.warning( 4245 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 4246 ) 4247 4248 log.info( 4249 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 4250 ) 4251 4252 annotation_infos = ",".join(annotation_list) 4253 4254 if annotation_infos != "": 4255 4256 # Protect header for bcftools (remove "#CHROM" and variants line) 4257 log.debug("Protect Header file - remove #CHROM line if exists") 4258 tmp_header_vcf = NamedTemporaryFile( 4259 prefix=self.get_prefix(), 4260 dir=self.get_tmp_dir(), 4261 suffix=".hdr", 4262 delete=False, 4263 ) 4264 tmp_header_vcf_name = tmp_header_vcf.name 4265 tmp_files.append(tmp_header_vcf_name) 4266 # Command 4267 if db_hdr_file.endswith(".gz"): 4268 command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4269 else: 4270 command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4271 # Run 4272 run_parallel_commands([command_extract_header], 1) 4273 4274 # Find chomosomes 4275 log.debug("Find chromosomes ") 4276 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 4277 sql_query_chromosomes_df = self.get_query_to_df( 4278 sql_query_chromosomes 4279 ) 4280 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 4281 4282 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 4283 4284 # BED columns in the annotation file 4285 if db_file_type in ["bed"]: 4286 annotation_infos = "CHROM,POS,POS," + annotation_infos 4287 4288 for chrom in chomosomes_list: 4289 4290 # Create BED on initial VCF 4291 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 4292 tmp_bed = NamedTemporaryFile( 4293 prefix=self.get_prefix(), 4294 
dir=self.get_tmp_dir(), 4295 suffix=".bed", 4296 delete=False, 4297 ) 4298 tmp_bed_name = tmp_bed.name 4299 tmp_files.append(tmp_bed_name) 4300 4301 # Detecte regions 4302 log.debug( 4303 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 4304 ) 4305 window = 1000000 4306 sql_query_intervals_for_bed = f""" 4307 SELECT \"#CHROM\", 4308 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 4309 \"POS\"+{window} 4310 FROM {table_variants} as table_variants 4311 WHERE table_variants.\"#CHROM\" = '{chrom}' 4312 """ 4313 regions = self.conn.execute( 4314 sql_query_intervals_for_bed 4315 ).fetchall() 4316 merged_regions = merge_regions(regions) 4317 log.debug( 4318 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 4319 ) 4320 4321 header = ["#CHROM", "START", "END"] 4322 with open(tmp_bed_name, "w") as f: 4323 # Write the header with tab delimiter 4324 f.write("\t".join(header) + "\n") 4325 for d in merged_regions: 4326 # Write each data row with tab delimiter 4327 f.write("\t".join(map(str, d)) + "\n") 4328 4329 # Tmp files 4330 tmp_annotation_vcf = NamedTemporaryFile( 4331 prefix=self.get_prefix(), 4332 dir=self.get_tmp_dir(), 4333 suffix=".vcf.gz", 4334 delete=False, 4335 ) 4336 tmp_annotation_vcf_name = tmp_annotation_vcf.name 4337 tmp_files.append(tmp_annotation_vcf_name) 4338 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 4339 tmp_annotation_vcf_name_err = ( 4340 tmp_annotation_vcf_name + ".err" 4341 ) 4342 err_files.append(tmp_annotation_vcf_name_err) 4343 4344 # Annotate Command 4345 log.debug( 4346 f"Annotation '{annotation}' - add bcftools command" 4347 ) 4348 4349 # Command 4350 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 4351 4352 # Add command 4353 commands.append(command_annotate) 4354 4355 # if some commands 4356 if commands: 4357 4358 # Export VCF file 4359 self.export_variant_vcf( 4360 vcf_file=tmp_vcf_name, 4361 remove_info=True, 4362 add_samples=False, 4363 index=True, 4364 ) 4365 4366 # Threads 4367 # calculate threads for annotated commands 4368 if commands: 4369 threads_bcftools_annotate = round(threads / len(commands)) 4370 else: 4371 threads_bcftools_annotate = 1 4372 4373 if not threads_bcftools_annotate: 4374 threads_bcftools_annotate = 1 4375 4376 # Add threads option to bcftools commands 4377 if threads_bcftools_annotate > 1: 4378 commands_threaded = [] 4379 for command in commands: 4380 commands_threaded.append( 4381 command.replace( 4382 f"{bcftools_bin_command} annotate ", 4383 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 4384 ) 4385 ) 4386 commands = commands_threaded 4387 4388 # Command annotation multithreading 4389 log.debug(f"Annotation - Annotation commands: " + str(commands)) 4390 log.info( 4391 f"Annotation - Annotation multithreaded in " 4392 + str(len(commands)) 4393 + " commands" 4394 ) 4395 4396 run_parallel_commands(commands, threads) 4397 4398 # Merge 4399 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4400 4401 if tmp_ann_vcf_list_cmd: 4402 4403 # Tmp file 4404 tmp_annotate_vcf = NamedTemporaryFile( 4405 prefix=self.get_prefix(), 4406 dir=self.get_tmp_dir(), 4407 suffix=".vcf.gz", 4408 delete=True, 4409 ) 4410 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4411 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4412 err_files.append(tmp_annotate_vcf_name_err) 4413 4414 # Tmp file remove command 4415 tmp_files_remove_command = "" 4416 if tmp_files: 4417 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4418 4419 # Command merge 4420 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4421 log.info( 4422 f"Annotation - Annotation merging " 4423 + str(len(commands)) 4424 + " annotated files" 4425 ) 4426 log.debug(f"Annotation - merge command: {merge_command}") 4427 run_parallel_commands([merge_command], 1) 4428 4429 # Error messages 4430 log.info(f"Error/Warning messages:") 4431 error_message_command_all = [] 4432 error_message_command_warning = [] 4433 error_message_command_err = [] 4434 for err_file in err_files: 4435 with open(err_file, "r") as f: 4436 for line in f: 4437 message = line.strip() 4438 error_message_command_all.append(message) 4439 if line.startswith("[W::"): 4440 error_message_command_warning.append(message) 4441 if line.startswith("[E::"): 4442 error_message_command_err.append( 4443 f"{err_file}: " + message 4444 ) 4445 # log info 4446 for message in list( 4447 set(error_message_command_err + error_message_command_warning) 4448 ): 4449 log.info(f" {message}") 4450 # debug info 4451 for message in list(set(error_message_command_all)): 4452 log.debug(f" {message}") 4453 # failed 4454 if len(error_message_command_err): 4455 log.error("Annotation failed: Error in commands") 4456 raise ValueError("Annotation failed: Error in commands") 4457 4458 # Update variants 4459 log.info(f"Annotation - Updating...") 4460 self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates with bcftools
Parameters
- threads: Number of threads to use
Returns
None.
4462 def annotation_exomiser(self, threads: int = None) -> None: 4463 """ 4464 This function annotate with Exomiser 4465 4466 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 4467 - "analysis" (dict/file): 4468 Full analysis dictionnary parameters (see Exomiser docs). 4469 Either a dict, or a file in JSON or YAML format. 4470 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 4471 Default : None 4472 - "preset" (string): 4473 Analysis preset (available in config folder). 4474 Used if no full "analysis" is provided. 4475 Default: "exome" 4476 - "phenopacket" (dict/file): 4477 Samples and phenotipic features parameters (see Exomiser docs). 4478 Either a dict, or a file in JSON or YAML format. 4479 Default: None 4480 - "subject" (dict): 4481 Sample parameters (see Exomiser docs). 4482 Example: 4483 "subject": 4484 { 4485 "id": "ISDBM322017", 4486 "sex": "FEMALE" 4487 } 4488 Default: None 4489 - "sample" (string): 4490 Sample name to construct "subject" section: 4491 "subject": 4492 { 4493 "id": "<sample>", 4494 "sex": "UNKNOWN_SEX" 4495 } 4496 Default: None 4497 - "phenotypicFeatures" (dict) 4498 Phenotypic features to construct "subject" section. 4499 Example: 4500 "phenotypicFeatures": 4501 [ 4502 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4503 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4504 ] 4505 - "hpo" (list) 4506 List of HPO ids as phenotypic features. 4507 Example: 4508 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4509 Default: [] 4510 - "outputOptions" (dict): 4511 Output options (see Exomiser docs). 
4512 Default: 4513 "output_options" = 4514 { 4515 "outputContributingVariantsOnly": False, 4516 "numGenes": 0, 4517 "outputFormats": ["TSV_VARIANT", "VCF"] 4518 } 4519 - "transcript_source" (string): 4520 Transcript source (either "refseq", "ucsc", "ensembl") 4521 Default: "refseq" 4522 - "exomiser_to_info" (boolean): 4523 Add exomiser TSV file columns as INFO fields in VCF. 4524 Default: False 4525 - "release" (string): 4526 Exomise database release. 4527 If not exists, database release will be downloaded (take a while). 4528 Default: None (provided by application.properties configuration file) 4529 - "exomiser_application_properties" (file): 4530 Exomiser configuration file (see Exomiser docs). 4531 Useful to automatically download databases (especially for specific genome databases). 4532 4533 Notes: 4534 - If no sample in parameters, first sample in VCF will be chosen 4535 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4536 4537 :param threads: The number of threads to use 4538 :return: None. 
4539 """ 4540 4541 # DEBUG 4542 log.debug("Start annotation with Exomiser databases") 4543 4544 # Threads 4545 if not threads: 4546 threads = self.get_threads() 4547 log.debug("Threads: " + str(threads)) 4548 4549 # Config 4550 config = self.get_config() 4551 log.debug("Config: " + str(config)) 4552 4553 # Config - Folders - Databases 4554 databases_folders = ( 4555 config.get("folders", {}) 4556 .get("databases", {}) 4557 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4558 ) 4559 databases_folders = full_path(databases_folders) 4560 if not os.path.exists(databases_folders): 4561 log.error(f"Databases annotations: {databases_folders} NOT found") 4562 log.debug("Databases annotations: " + str(databases_folders)) 4563 4564 # Config - Exomiser 4565 exomiser_bin_command = get_bin_command( 4566 bin="exomiser-cli*.jar", 4567 tool="exomiser", 4568 bin_type="jar", 4569 config=config, 4570 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4571 ) 4572 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4573 if not exomiser_bin_command: 4574 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4575 log.error(msg_err) 4576 raise ValueError(msg_err) 4577 4578 # Param 4579 param = self.get_param() 4580 log.debug("Param: " + str(param)) 4581 4582 # Param - Exomiser 4583 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4584 log.debug(f"Param Exomiser: {param_exomiser}") 4585 4586 # Param - Assembly 4587 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4588 log.debug("Assembly: " + str(assembly)) 4589 4590 # Data 4591 table_variants = self.get_table_variants() 4592 4593 # Check if not empty 4594 log.debug("Check if not empty") 4595 sql_query_chromosomes = ( 4596 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4597 ) 4598 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4599 log.info(f"VCF empty") 4600 return False 4601 4602 # VCF header 4603 
vcf_reader = self.get_header() 4604 log.debug("Initial header: " + str(vcf_reader.infos)) 4605 4606 # Samples 4607 samples = self.get_header_sample_list() 4608 if not samples: 4609 log.error("No Samples in VCF") 4610 return False 4611 log.debug(f"Samples: {samples}") 4612 4613 # Memory limit 4614 memory_limit = self.get_memory("8G") 4615 log.debug(f"memory_limit: {memory_limit}") 4616 4617 # Exomiser java options 4618 exomiser_java_options = ( 4619 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4620 ) 4621 log.debug(f"Exomiser java options: {exomiser_java_options}") 4622 4623 # Download Exomiser (if not exists) 4624 exomiser_release = param_exomiser.get("release", None) 4625 exomiser_application_properties = param_exomiser.get( 4626 "exomiser_application_properties", None 4627 ) 4628 databases_download_exomiser( 4629 assemblies=[assembly], 4630 exomiser_folder=databases_folders, 4631 exomiser_release=exomiser_release, 4632 exomiser_phenotype_release=exomiser_release, 4633 exomiser_application_properties=exomiser_application_properties, 4634 ) 4635 4636 # Force annotation 4637 force_update_annotation = True 4638 4639 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4640 log.debug("Start annotation Exomiser") 4641 4642 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4643 4644 # tmp_dir = "/tmp/exomiser" 4645 4646 ### ANALYSIS ### 4647 ################ 4648 4649 # Create analysis.json through analysis dict 4650 # either analysis in param or by default 4651 # depending on preset exome/genome) 4652 4653 # Init analysis dict 4654 param_exomiser_analysis_dict = {} 4655 4656 # analysis from param 4657 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4658 param_exomiser_analysis = full_path(param_exomiser_analysis) 4659 4660 # If analysis in param -> load anlaysis json 4661 if param_exomiser_analysis: 4662 4663 # If param analysis is a file and exists 4664 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4665 param_exomiser_analysis 4666 ): 4667 # Load analysis file into analysis dict (either yaml or json) 4668 with open(param_exomiser_analysis) as json_file: 4669 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4670 4671 # If param analysis is a dict 4672 elif isinstance(param_exomiser_analysis, dict): 4673 # Load analysis dict into analysis dict (either yaml or json) 4674 param_exomiser_analysis_dict = param_exomiser_analysis 4675 4676 # Error analysis type 4677 else: 4678 log.error(f"Analysis type unknown. Check param file.") 4679 raise ValueError(f"Analysis type unknown. Check param file.") 4680 4681 # Case no input analysis config file/dict 4682 # Use preset (exome/genome) to open default config file 4683 if not param_exomiser_analysis_dict: 4684 4685 # default preset 4686 default_preset = "exome" 4687 4688 # Get param preset or default preset 4689 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4690 4691 # Try to find if preset is a file 4692 if os.path.exists(param_exomiser_preset): 4693 # Preset file is provided in full path 4694 param_exomiser_analysis_default_config_file = ( 4695 param_exomiser_preset 4696 ) 4697 # elif os.path.exists(full_path(param_exomiser_preset)): 4698 # # Preset file is provided in full path 4699 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4700 elif os.path.exists( 4701 os.path.join(folder_config, param_exomiser_preset) 4702 ): 4703 # Preset file is provided a basename in config folder (can be a path with subfolders) 4704 param_exomiser_analysis_default_config_file = os.path.join( 4705 folder_config, param_exomiser_preset 4706 ) 4707 else: 4708 # Construct preset file 4709 param_exomiser_analysis_default_config_file = os.path.join( 4710 folder_config, 4711 f"preset-{param_exomiser_preset}-analysis.json", 4712 ) 4713 4714 # If preset file exists 4715 param_exomiser_analysis_default_config_file = full_path( 4716 
param_exomiser_analysis_default_config_file 4717 ) 4718 if os.path.exists(param_exomiser_analysis_default_config_file): 4719 # Load prest file into analysis dict (either yaml or json) 4720 with open( 4721 param_exomiser_analysis_default_config_file 4722 ) as json_file: 4723 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4724 json_file 4725 ) 4726 4727 # Error preset file 4728 else: 4729 log.error( 4730 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4731 ) 4732 raise ValueError( 4733 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4734 ) 4735 4736 # If no analysis dict created 4737 if not param_exomiser_analysis_dict: 4738 log.error(f"No analysis config") 4739 raise ValueError(f"No analysis config") 4740 4741 # Log 4742 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4743 4744 ### PHENOPACKET ### 4745 ################### 4746 4747 # If no PhenoPacket in analysis dict -> check in param 4748 if "phenopacket" not in param_exomiser_analysis_dict: 4749 4750 # If PhenoPacket in param -> load anlaysis json 4751 if param_exomiser.get("phenopacket", None): 4752 4753 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4754 param_exomiser_phenopacket = full_path( 4755 param_exomiser_phenopacket 4756 ) 4757 4758 # If param phenopacket is a file and exists 4759 if isinstance( 4760 param_exomiser_phenopacket, str 4761 ) and os.path.exists(param_exomiser_phenopacket): 4762 # Load phenopacket file into analysis dict (either yaml or json) 4763 with open(param_exomiser_phenopacket) as json_file: 4764 param_exomiser_analysis_dict["phenopacket"] = ( 4765 yaml.safe_load(json_file) 4766 ) 4767 4768 # If param phenopacket is a dict 4769 elif isinstance(param_exomiser_phenopacket, dict): 4770 # Load phenopacket dict into analysis dict (either yaml or json) 4771 param_exomiser_analysis_dict["phenopacket"] = ( 4772 param_exomiser_phenopacket 4773 ) 4774 4775 # Error phenopacket type 
4776 else: 4777 log.error(f"Phenopacket type unknown. Check param file.") 4778 raise ValueError( 4779 f"Phenopacket type unknown. Check param file." 4780 ) 4781 4782 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4783 if "phenopacket" not in param_exomiser_analysis_dict: 4784 4785 # Init PhenoPacket 4786 param_exomiser_analysis_dict["phenopacket"] = { 4787 "id": "analysis", 4788 "proband": {}, 4789 } 4790 4791 ### Add subject ### 4792 4793 # If subject exists 4794 param_exomiser_subject = param_exomiser.get("subject", {}) 4795 4796 # If subject not exists -> found sample ID 4797 if not param_exomiser_subject: 4798 4799 # Found sample ID in param 4800 sample = param_exomiser.get("sample", None) 4801 4802 # Find sample ID (first sample) 4803 if not sample: 4804 sample_list = self.get_header_sample_list() 4805 if len(sample_list) > 0: 4806 sample = sample_list[0] 4807 else: 4808 log.error(f"No sample found") 4809 raise ValueError(f"No sample found") 4810 4811 # Create subject 4812 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4813 4814 # Add to dict 4815 param_exomiser_analysis_dict["phenopacket"][ 4816 "subject" 4817 ] = param_exomiser_subject 4818 4819 ### Add "phenotypicFeatures" ### 4820 4821 # If phenotypicFeatures exists 4822 param_exomiser_phenotypicfeatures = param_exomiser.get( 4823 "phenotypicFeatures", [] 4824 ) 4825 4826 # If phenotypicFeatures not exists -> Try to infer from hpo list 4827 if not param_exomiser_phenotypicfeatures: 4828 4829 # Found HPO in param 4830 param_exomiser_hpo = param_exomiser.get("hpo", []) 4831 4832 # Split HPO if list in string format separated by comma 4833 if isinstance(param_exomiser_hpo, str): 4834 param_exomiser_hpo = param_exomiser_hpo.split(",") 4835 4836 # Create HPO list 4837 for hpo in param_exomiser_hpo: 4838 hpo_clean = re.sub("[^0-9]", "", hpo) 4839 param_exomiser_phenotypicfeatures.append( 4840 { 4841 "type": { 4842 "id": f"HP:{hpo_clean}", 4843 "label": 
f"HP:{hpo_clean}", 4844 } 4845 } 4846 ) 4847 4848 # Add to dict 4849 param_exomiser_analysis_dict["phenopacket"][ 4850 "phenotypicFeatures" 4851 ] = param_exomiser_phenotypicfeatures 4852 4853 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4854 if not param_exomiser_phenotypicfeatures: 4855 for step in param_exomiser_analysis_dict.get( 4856 "analysis", {} 4857 ).get("steps", []): 4858 if "hiPhivePrioritiser" in step: 4859 param_exomiser_analysis_dict.get("analysis", {}).get( 4860 "steps", [] 4861 ).remove(step) 4862 4863 ### Add Input File ### 4864 4865 # Initial file name and htsFiles 4866 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4867 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4868 { 4869 "uri": tmp_vcf_name, 4870 "htsFormat": "VCF", 4871 "genomeAssembly": assembly, 4872 } 4873 ] 4874 4875 ### Add metaData ### 4876 4877 # If metaData not in analysis dict 4878 if "metaData" not in param_exomiser_analysis_dict: 4879 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4880 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4881 "createdBy": "howard", 4882 "phenopacketSchemaVersion": 1, 4883 } 4884 4885 ### OutputOptions ### 4886 4887 # Init output result folder 4888 output_results = os.path.join(tmp_dir, "results") 4889 4890 # If no outputOptions in analysis dict 4891 if "outputOptions" not in param_exomiser_analysis_dict: 4892 4893 # default output formats 4894 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4895 4896 # Get outputOptions in param 4897 output_options = param_exomiser.get("outputOptions", None) 4898 4899 # If no output_options in param -> check 4900 if not output_options: 4901 output_options = { 4902 "outputContributingVariantsOnly": False, 4903 "numGenes": 0, 4904 "outputFormats": defaut_output_formats, 4905 } 4906 4907 # Replace outputDirectory in output options 4908 output_options["outputDirectory"] = output_results 4909 output_options["outputFileName"] = "howard" 4910 4911 # 
Add outputOptions in analysis dict 4912 param_exomiser_analysis_dict["outputOptions"] = output_options 4913 4914 else: 4915 4916 # Replace output_results and output format (if exists in param) 4917 param_exomiser_analysis_dict["outputOptions"][ 4918 "outputDirectory" 4919 ] = output_results 4920 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4921 list( 4922 set( 4923 param_exomiser_analysis_dict.get( 4924 "outputOptions", {} 4925 ).get("outputFormats", []) 4926 + ["TSV_VARIANT", "VCF"] 4927 ) 4928 ) 4929 ) 4930 4931 # log 4932 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4933 4934 ### ANALYSIS FILE ### 4935 ##################### 4936 4937 ### Full JSON analysis config file ### 4938 4939 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4940 with open(exomiser_analysis, "w") as fp: 4941 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4942 4943 ### SPLIT analysis and sample config files 4944 4945 # Splitted analysis dict 4946 param_exomiser_analysis_dict_for_split = ( 4947 param_exomiser_analysis_dict.copy() 4948 ) 4949 4950 # Phenopacket JSON file 4951 exomiser_analysis_phenopacket = os.path.join( 4952 tmp_dir, "analysis_phenopacket.json" 4953 ) 4954 with open(exomiser_analysis_phenopacket, "w") as fp: 4955 json.dump( 4956 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4957 fp, 4958 indent=4, 4959 ) 4960 4961 # Analysis JSON file without Phenopacket parameters 4962 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4963 exomiser_analysis_analysis = os.path.join( 4964 tmp_dir, "analysis_analysis.json" 4965 ) 4966 with open(exomiser_analysis_analysis, "w") as fp: 4967 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4968 4969 ### INITAL VCF file ### 4970 ####################### 4971 4972 ### Create list of samples to use and include inti initial VCF file #### 4973 4974 # Subject (main sample) 4975 # Get sample ID in analysis dict 4976 sample_subject = ( 4977 
param_exomiser_analysis_dict.get("phenopacket", {}) 4978 .get("subject", {}) 4979 .get("id", None) 4980 ) 4981 sample_proband = ( 4982 param_exomiser_analysis_dict.get("phenopacket", {}) 4983 .get("proband", {}) 4984 .get("subject", {}) 4985 .get("id", None) 4986 ) 4987 sample = [] 4988 if sample_subject: 4989 sample.append(sample_subject) 4990 if sample_proband: 4991 sample.append(sample_proband) 4992 4993 # Get sample ID within Pedigree 4994 pedigree_persons_list = ( 4995 param_exomiser_analysis_dict.get("phenopacket", {}) 4996 .get("pedigree", {}) 4997 .get("persons", {}) 4998 ) 4999 5000 # Create list with all sample ID in pedigree (if exists) 5001 pedigree_persons = [] 5002 for person in pedigree_persons_list: 5003 pedigree_persons.append(person.get("individualId")) 5004 5005 # Concat subject sample ID and samples ID in pedigreesamples 5006 samples = list(set(sample + pedigree_persons)) 5007 5008 # Check if sample list is not empty 5009 if not samples: 5010 log.error(f"No samples found") 5011 raise ValueError(f"No samples found") 5012 5013 # Create VCF with sample (either sample in param or first one by default) 5014 # Export VCF file 5015 self.export_variant_vcf( 5016 vcf_file=tmp_vcf_name, 5017 remove_info=True, 5018 add_samples=True, 5019 list_samples=samples, 5020 index=False, 5021 ) 5022 5023 ### Execute Exomiser ### 5024 ######################## 5025 5026 # Init command 5027 exomiser_command = "" 5028 5029 # Command exomiser options 5030 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5031 5032 # Release 5033 exomiser_release = param_exomiser.get("release", None) 5034 if exomiser_release: 5035 # phenotype data version 5036 exomiser_options += ( 5037 f" --exomiser.phenotype.data-version={exomiser_release} " 5038 ) 5039 # data version 5040 exomiser_options += ( 5041 f" --exomiser.{assembly}.data-version={exomiser_release} " 5042 ) 5043 # variant 
white list 5044 variant_white_list_file = ( 5045 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5046 ) 5047 if os.path.exists( 5048 os.path.join( 5049 databases_folders, assembly, variant_white_list_file 5050 ) 5051 ): 5052 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5053 5054 # transcript_source 5055 transcript_source = param_exomiser.get( 5056 "transcript_source", None 5057 ) # ucsc, refseq, ensembl 5058 if transcript_source: 5059 exomiser_options += ( 5060 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5061 ) 5062 5063 # If analysis contain proband param 5064 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5065 "proband", {} 5066 ): 5067 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5068 5069 # If no proband (usually uniq sample) 5070 else: 5071 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5072 5073 # Log 5074 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5075 5076 # Run command 5077 result = subprocess.call( 5078 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5079 ) 5080 if result: 5081 log.error("Exomiser command failed") 5082 raise ValueError("Exomiser command failed") 5083 5084 ### RESULTS ### 5085 ############### 5086 5087 ### Annotate with TSV fields ### 5088 5089 # Init result tsv file 5090 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5091 5092 # Init result tsv file 5093 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5094 5095 # Parse TSV file and explode columns in INFO field 5096 if exomiser_to_info and os.path.exists(output_results_tsv): 5097 5098 # Log 5099 log.debug("Exomiser columns to VCF INFO field") 5100 5101 # Retrieve columns and types 5102 query = f""" SELECT * FROM read_csv('{output_results_tsv}', 
auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5103 output_results_tsv_df = self.get_query_to_df(query) 5104 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5105 5106 # Init concat fields for update 5107 sql_query_update_concat_fields = [] 5108 5109 # Fields to avoid 5110 fields_to_avoid = [ 5111 "CONTIG", 5112 "START", 5113 "END", 5114 "REF", 5115 "ALT", 5116 "QUAL", 5117 "FILTER", 5118 "GENOTYPE", 5119 ] 5120 5121 # List all columns to add into header 5122 for header_column in output_results_tsv_columns: 5123 5124 # If header column is enable 5125 if header_column not in fields_to_avoid: 5126 5127 # Header info type 5128 header_info_type = "String" 5129 header_column_df = output_results_tsv_df[header_column] 5130 header_column_df_dtype = header_column_df.dtype 5131 if header_column_df_dtype == object: 5132 if ( 5133 pd.to_numeric(header_column_df, errors="coerce") 5134 .notnull() 5135 .all() 5136 ): 5137 header_info_type = "Float" 5138 else: 5139 header_info_type = "Integer" 5140 5141 # Header info 5142 characters_to_validate = ["-"] 5143 pattern = "[" + "".join(characters_to_validate) + "]" 5144 header_info_name = re.sub( 5145 pattern, 5146 "_", 5147 f"Exomiser_{header_column}".replace("#", ""), 5148 ) 5149 header_info_number = "." 
5150 header_info_description = ( 5151 f"Exomiser {header_column} annotation" 5152 ) 5153 header_info_source = "Exomiser" 5154 header_info_version = "unknown" 5155 header_info_code = CODE_TYPE_MAP[header_info_type] 5156 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5157 header_info_name, 5158 header_info_number, 5159 header_info_type, 5160 header_info_description, 5161 header_info_source, 5162 header_info_version, 5163 header_info_code, 5164 ) 5165 5166 # Add field to add for update to concat fields 5167 sql_query_update_concat_fields.append( 5168 f""" 5169 CASE 5170 WHEN table_parquet."{header_column}" NOT IN ('','.') 5171 THEN concat( 5172 '{header_info_name}=', 5173 table_parquet."{header_column}", 5174 ';' 5175 ) 5176 5177 ELSE '' 5178 END 5179 """ 5180 ) 5181 5182 # Update query 5183 sql_query_update = f""" 5184 UPDATE {table_variants} as table_variants 5185 SET INFO = concat( 5186 CASE 5187 WHEN INFO NOT IN ('', '.') 5188 THEN INFO 5189 ELSE '' 5190 END, 5191 CASE 5192 WHEN table_variants.INFO NOT IN ('','.') 5193 THEN ';' 5194 ELSE '' 5195 END, 5196 ( 5197 SELECT 5198 concat( 5199 {",".join(sql_query_update_concat_fields)} 5200 ) 5201 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5202 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5203 AND table_parquet.\"START\" = table_variants.\"POS\" 5204 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5205 AND table_parquet.\"REF\" = table_variants.\"REF\" 5206 ) 5207 ) 5208 ; 5209 """ 5210 5211 # Update 5212 self.conn.execute(sql_query_update) 5213 5214 ### Annotate with VCF INFO field ### 5215 5216 # Init result VCF file 5217 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5218 5219 # If VCF exists 5220 if os.path.exists(output_results_vcf): 5221 5222 # Log 5223 log.debug("Exomiser result VCF update variants") 5224 5225 # Find Exomiser INFO field annotation in header 5226 with 
gzip.open(output_results_vcf, "rt") as f: 5227 header_list = self.read_vcf_header(f) 5228 exomiser_vcf_header = vcf.Reader( 5229 io.StringIO("\n".join(header_list)) 5230 ) 5231 5232 # Add annotation INFO field to header 5233 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5234 5235 # Update variants with VCF 5236 self.update_from_vcf(output_results_vcf) 5237 5238 return True
This function annotate with Exomiser
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default: None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string):
Sample name to construct "subject" section:
"subject":
{
"id": "
", "sex": "UNKNOWN_SEX" } Default: None - "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomiser database release. If not exists, database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
5240 def annotation_snpeff(self, threads: int = None) -> None: 5241 """ 5242 This function annotate with snpEff 5243 5244 :param threads: The number of threads to use 5245 :return: the value of the variable "return_value". 5246 """ 5247 5248 # DEBUG 5249 log.debug("Start annotation with snpeff databases") 5250 5251 # Threads 5252 if not threads: 5253 threads = self.get_threads() 5254 log.debug("Threads: " + str(threads)) 5255 5256 # DEBUG 5257 delete_tmp = True 5258 if self.get_config().get("verbosity", "warning") in ["debug"]: 5259 delete_tmp = False 5260 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5261 5262 # Config 5263 config = self.get_config() 5264 log.debug("Config: " + str(config)) 5265 5266 # Config - Folders - Databases 5267 databases_folders = ( 5268 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5269 ) 5270 log.debug("Databases annotations: " + str(databases_folders)) 5271 5272 # Config - snpEff bin command 5273 snpeff_bin_command = get_bin_command( 5274 bin="snpEff.jar", 5275 tool="snpeff", 5276 bin_type="jar", 5277 config=config, 5278 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5279 ) 5280 if not snpeff_bin_command: 5281 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5282 log.error(msg_err) 5283 raise ValueError(msg_err) 5284 5285 # Config - snpEff databases 5286 snpeff_databases = ( 5287 config.get("folders", {}) 5288 .get("databases", {}) 5289 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5290 ) 5291 snpeff_databases = full_path(snpeff_databases) 5292 if snpeff_databases is not None and snpeff_databases != "": 5293 log.debug(f"Create snpEff databases folder") 5294 if not os.path.exists(snpeff_databases): 5295 os.makedirs(snpeff_databases) 5296 5297 # Param 5298 param = self.get_param() 5299 log.debug("Param: " + str(param)) 5300 5301 # Param 5302 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5303 log.debug("Options: " + str(options)) 5304 5305 # Param - Assembly 
5306 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5307 5308 # Param - Options 5309 snpeff_options = ( 5310 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5311 ) 5312 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5313 snpeff_csvstats = ( 5314 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5315 ) 5316 if snpeff_stats: 5317 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5318 snpeff_stats = full_path(snpeff_stats) 5319 snpeff_options += f" -stats {snpeff_stats}" 5320 if snpeff_csvstats: 5321 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5322 snpeff_csvstats = full_path(snpeff_csvstats) 5323 snpeff_options += f" -csvStats {snpeff_csvstats}" 5324 5325 # Data 5326 table_variants = self.get_table_variants() 5327 5328 # Check if not empty 5329 log.debug("Check if not empty") 5330 sql_query_chromosomes = ( 5331 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5332 ) 5333 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5334 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5335 log.info(f"VCF empty") 5336 return 5337 5338 # Export in VCF 5339 log.debug("Create initial file to annotate") 5340 tmp_vcf = NamedTemporaryFile( 5341 prefix=self.get_prefix(), 5342 dir=self.get_tmp_dir(), 5343 suffix=".vcf.gz", 5344 delete=True, 5345 ) 5346 tmp_vcf_name = tmp_vcf.name 5347 5348 # VCF header 5349 vcf_reader = self.get_header() 5350 log.debug("Initial header: " + str(vcf_reader.infos)) 5351 5352 # Existing annotations 5353 for vcf_annotation in self.get_header().infos: 5354 5355 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5356 log.debug( 5357 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5358 ) 5359 5360 # Memory limit 5361 # if config.get("memory", None): 5362 # memory_limit = config.get("memory", "8G") 5363 # else: 5364 # 
memory_limit = "8G" 5365 memory_limit = self.get_memory("8G") 5366 log.debug(f"memory_limit: {memory_limit}") 5367 5368 # snpEff java options 5369 snpeff_java_options = ( 5370 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5371 ) 5372 log.debug(f"Exomiser java options: {snpeff_java_options}") 5373 5374 force_update_annotation = True 5375 5376 if "ANN" not in self.get_header().infos or force_update_annotation: 5377 5378 # Check snpEff database 5379 log.debug(f"Check snpEff databases {[assembly]}") 5380 databases_download_snpeff( 5381 folder=snpeff_databases, assemblies=[assembly], config=config 5382 ) 5383 5384 # Export VCF file 5385 self.export_variant_vcf( 5386 vcf_file=tmp_vcf_name, 5387 remove_info=True, 5388 add_samples=False, 5389 index=True, 5390 ) 5391 5392 # Tmp file 5393 err_files = [] 5394 tmp_annotate_vcf = NamedTemporaryFile( 5395 prefix=self.get_prefix(), 5396 dir=self.get_tmp_dir(), 5397 suffix=".vcf", 5398 delete=False, 5399 ) 5400 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5401 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5402 err_files.append(tmp_annotate_vcf_name_err) 5403 5404 # Command 5405 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5406 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5407 run_parallel_commands([snpeff_command], 1) 5408 5409 # Error messages 5410 log.info(f"Error/Warning messages:") 5411 error_message_command_all = [] 5412 error_message_command_warning = [] 5413 error_message_command_err = [] 5414 for err_file in err_files: 5415 with open(err_file, "r") as f: 5416 for line in f: 5417 message = line.strip() 5418 error_message_command_all.append(message) 5419 if line.startswith("[W::"): 5420 error_message_command_warning.append(message) 5421 if line.startswith("[E::"): 5422 error_message_command_err.append(f"{err_file}: " + message) 5423 # log info 
5424 for message in list( 5425 set(error_message_command_err + error_message_command_warning) 5426 ): 5427 log.info(f" {message}") 5428 # debug info 5429 for message in list(set(error_message_command_all)): 5430 log.debug(f" {message}") 5431 # failed 5432 if len(error_message_command_err): 5433 log.error("Annotation failed: Error in commands") 5434 raise ValueError("Annotation failed: Error in commands") 5435 5436 # Find annotation in header 5437 with open(tmp_annotate_vcf_name, "rt") as f: 5438 header_list = self.read_vcf_header(f) 5439 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5440 5441 for ann in annovar_vcf_header.infos: 5442 if ann not in self.get_header().infos: 5443 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5444 5445 # Update variants 5446 log.info(f"Annotation - Updating...") 5447 self.update_from_vcf(tmp_annotate_vcf_name) 5448 5449 else: 5450 if "ANN" in self.get_header().infos: 5451 log.debug(f"Existing snpEff annotations in VCF") 5452 if force_update_annotation: 5453 log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotate with snpEff
Parameters
- threads: The number of threads to use
Returns
None.
5455 def annotation_annovar(self, threads: int = None) -> None: 5456 """ 5457 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5458 annotations 5459 5460 :param threads: number of threads to use 5461 :return: the value of the variable "return_value". 5462 """ 5463 5464 # DEBUG 5465 log.debug("Start annotation with Annovar databases") 5466 5467 # Threads 5468 if not threads: 5469 threads = self.get_threads() 5470 log.debug("Threads: " + str(threads)) 5471 5472 # Tmp en Err files 5473 tmp_files = [] 5474 err_files = [] 5475 5476 # DEBUG 5477 delete_tmp = True 5478 if self.get_config().get("verbosity", "warning") in ["debug"]: 5479 delete_tmp = False 5480 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5481 5482 # Config 5483 config = self.get_config() 5484 log.debug("Config: " + str(config)) 5485 5486 # Config - Folders - Databases 5487 databases_folders = ( 5488 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 5489 ) 5490 log.debug("Databases annotations: " + str(databases_folders)) 5491 5492 # Config - annovar bin command 5493 annovar_bin_command = get_bin_command( 5494 bin="table_annovar.pl", 5495 tool="annovar", 5496 bin_type="perl", 5497 config=config, 5498 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 5499 ) 5500 if not annovar_bin_command: 5501 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 5502 log.error(msg_err) 5503 raise ValueError(msg_err) 5504 5505 # Config - BCFTools bin command 5506 bcftools_bin_command = get_bin_command( 5507 bin="bcftools", 5508 tool="bcftools", 5509 bin_type="bin", 5510 config=config, 5511 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5512 ) 5513 if not bcftools_bin_command: 5514 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5515 log.error(msg_err) 5516 raise ValueError(msg_err) 5517 5518 # Config - annovar databases 5519 annovar_databases = ( 5520 config.get("folders", {}) 5521 .get("databases", {}) 
5522 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5523 ) 5524 if annovar_databases is not None: 5525 if isinstance(annovar_databases, list): 5526 annovar_databases = full_path(annovar_databases[0]) 5527 log.warning(f"Annovar databases folder '{annovar_databases}' selected") 5528 annovar_databases = full_path(annovar_databases) 5529 if not os.path.exists(annovar_databases): 5530 log.info(f"Annovar databases folder '{annovar_databases}' created") 5531 Path(annovar_databases).mkdir(parents=True, exist_ok=True) 5532 else: 5533 msg_err = f"Annovar databases configuration failed" 5534 log.error(msg_err) 5535 raise ValueError(msg_err) 5536 5537 # Param 5538 param = self.get_param() 5539 log.debug("Param: " + str(param)) 5540 5541 # Param - options 5542 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5543 log.debug("Options: " + str(options)) 5544 5545 # Param - annotations 5546 annotations = ( 5547 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5548 ) 5549 log.debug("Annotations: " + str(annotations)) 5550 5551 # Param - Assembly 5552 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5553 5554 # Annovar database assembly 5555 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5556 if annovar_databases_assembly != "" and not os.path.exists( 5557 annovar_databases_assembly 5558 ): 5559 os.makedirs(annovar_databases_assembly) 5560 5561 # Data 5562 table_variants = self.get_table_variants() 5563 5564 # Check if not empty 5565 log.debug("Check if not empty") 5566 sql_query_chromosomes = ( 5567 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5568 ) 5569 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5570 if not sql_query_chromosomes_df["count"][0]: 5571 log.info(f"VCF empty") 5572 return 5573 5574 # VCF header 5575 vcf_reader = self.get_header() 5576 log.debug("Initial header: " + str(vcf_reader.infos)) 5577 5578 # Existing annotations 5579 for 
vcf_annotation in self.get_header().infos: 5580 5581 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5582 log.debug( 5583 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5584 ) 5585 5586 force_update_annotation = True 5587 5588 if annotations: 5589 5590 commands = [] 5591 tmp_annotates_vcf_name_list = [] 5592 5593 # Export in VCF 5594 log.debug("Create initial file to annotate") 5595 tmp_vcf = NamedTemporaryFile( 5596 prefix=self.get_prefix(), 5597 dir=self.get_tmp_dir(), 5598 suffix=".vcf.gz", 5599 delete=False, 5600 ) 5601 tmp_vcf_name = tmp_vcf.name 5602 tmp_files.append(tmp_vcf_name) 5603 tmp_files.append(tmp_vcf_name + ".tbi") 5604 5605 # Export VCF file 5606 self.export_variant_vcf( 5607 vcf_file=tmp_vcf_name, 5608 remove_info=".", 5609 add_samples=False, 5610 index=True, 5611 ) 5612 5613 # Create file for field rename 5614 log.debug("Create file for field rename") 5615 tmp_rename = NamedTemporaryFile( 5616 prefix=self.get_prefix(), 5617 dir=self.get_tmp_dir(), 5618 suffix=".rename", 5619 delete=False, 5620 ) 5621 tmp_rename_name = tmp_rename.name 5622 tmp_files.append(tmp_rename_name) 5623 5624 # Check Annovar database 5625 log.debug( 5626 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5627 ) 5628 databases_download_annovar( 5629 folder=annovar_databases, 5630 files=list(annotations.keys()), 5631 assemblies=[assembly], 5632 ) 5633 5634 for annotation in annotations: 5635 annotation_fields = annotations[annotation] 5636 5637 if not annotation_fields: 5638 annotation_fields = {"INFO": None} 5639 5640 log.info(f"Annotations Annovar - database '{annotation}'") 5641 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5642 5643 # Tmp file for annovar 5644 err_files = [] 5645 tmp_annotate_vcf_directory = TemporaryDirectory( 5646 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5647 ) 5648 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5649 
tmp_annotate_vcf_name_annovar = ( 5650 tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf" 5651 ) 5652 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5653 err_files.append(tmp_annotate_vcf_name_err) 5654 tmp_files.append(tmp_annotate_vcf_name_err) 5655 5656 # Tmp file final vcf annotated by annovar 5657 tmp_annotate_vcf = NamedTemporaryFile( 5658 prefix=self.get_prefix(), 5659 dir=self.get_tmp_dir(), 5660 suffix=".vcf.gz", 5661 delete=False, 5662 ) 5663 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5664 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5665 tmp_files.append(tmp_annotate_vcf_name) 5666 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5667 5668 # Number of fields 5669 annotation_list = [] 5670 annotation_renamed_list = [] 5671 5672 for annotation_field in annotation_fields: 5673 5674 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5675 annotation_fields_new_name = annotation_fields.get( 5676 annotation_field, annotation_field 5677 ) 5678 if not annotation_fields_new_name: 5679 annotation_fields_new_name = annotation_field 5680 5681 if ( 5682 force_update_annotation 5683 or annotation_fields_new_name not in self.get_header().infos 5684 ): 5685 annotation_list.append(annotation_field) 5686 annotation_renamed_list.append(annotation_fields_new_name) 5687 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5688 log.warning( 5689 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5690 ) 5691 5692 # Add rename info 5693 run_parallel_commands( 5694 [ 5695 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5696 ], 5697 1, 5698 ) 5699 5700 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5701 log.debug("annotation_list: " + str(annotation_list)) 5702 5703 # protocol 5704 protocol = annotation 5705 5706 # argument 5707 argument = "" 5708 5709 # operation 5710 operation = "f" 
5711 if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith( 5712 "ensGene" 5713 ): 5714 operation = "g" 5715 if options.get("genebase", None): 5716 argument = f"""'{options.get("genebase","")}'""" 5717 elif annotation in ["cytoBand"]: 5718 operation = "r" 5719 5720 # argument option 5721 argument_option = "" 5722 if argument != "": 5723 argument_option = " --argument " + argument 5724 5725 # command options 5726 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5727 for option in options: 5728 if option not in ["genebase"]: 5729 command_options += f""" --{option}={options[option]}""" 5730 5731 # Command 5732 5733 # Command - Annovar 5734 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5735 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5736 5737 # Command - start pipe 5738 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5739 5740 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5741 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5742 5743 # Command - Special characters (refGene annotation) 5744 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5745 5746 # Command - Clean empty fields (with value ".") 5747 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5748 5749 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5750 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5751 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5752 # for ann in annotation_renamed_list: 5753 for ann in annotation_list: 5754 annovar_fields_to_keep.append(f"^INFO/{ann}") 5755 5756 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5757 5758 # Command - indexing 5759 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5760 5761 log.debug(f"Annotation - Annovar command: {command_annovar}") 5762 run_parallel_commands([command_annovar], 1) 5763 5764 # Error messages 5765 log.info(f"Error/Warning messages:") 5766 error_message_command_all = [] 5767 error_message_command_warning = [] 5768 error_message_command_err = [] 5769 for err_file in err_files: 5770 with open(err_file, "r") as f: 5771 for line in f: 5772 message = line.strip() 5773 error_message_command_all.append(message) 5774 if line.startswith("[W::") or line.startswith("WARNING"): 5775 error_message_command_warning.append(message) 5776 if line.startswith("[E::") or line.startswith("ERROR"): 5777 
error_message_command_err.append( 5778 f"{err_file}: " + message 5779 ) 5780 # log info 5781 for message in list( 5782 set(error_message_command_err + error_message_command_warning) 5783 ): 5784 log.info(f" {message}") 5785 # debug info 5786 for message in list(set(error_message_command_all)): 5787 log.debug(f" {message}") 5788 # failed 5789 if len(error_message_command_err): 5790 log.error("Annotation failed: Error in commands") 5791 raise ValueError("Annotation failed: Error in commands") 5792 5793 if tmp_annotates_vcf_name_list: 5794 5795 # List of annotated files 5796 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5797 5798 # Tmp file 5799 tmp_annotate_vcf = NamedTemporaryFile( 5800 prefix=self.get_prefix(), 5801 dir=self.get_tmp_dir(), 5802 suffix=".vcf.gz", 5803 delete=False, 5804 ) 5805 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5806 tmp_files.append(tmp_annotate_vcf_name) 5807 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5808 err_files.append(tmp_annotate_vcf_name_err) 5809 tmp_files.append(tmp_annotate_vcf_name_err) 5810 5811 # Command merge 5812 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5813 log.info( 5814 f"Annotation Annovar - Annotation merging " 5815 + str(len(tmp_annotates_vcf_name_list)) 5816 + " annotated files" 5817 ) 5818 log.debug(f"Annotation - merge command: {merge_command}") 5819 run_parallel_commands([merge_command], 1) 5820 5821 # Find annotation in header 5822 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5823 header_list = self.read_vcf_header(f) 5824 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5825 5826 for ann in annovar_vcf_header.infos: 5827 if ann not in self.get_header().infos: 5828 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5829 5830 # Update variants 5831 log.info(f"Annotation Annovar - 
Updating...") 5832 self.update_from_vcf(tmp_annotate_vcf_name) 5833 5834 # Clean files 5835 # Tmp file remove command 5836 if True: 5837 tmp_files_remove_command = "" 5838 if tmp_files: 5839 tmp_files_remove_command = " ".join(tmp_files) 5840 clean_command = f" rm -f {tmp_files_remove_command} " 5841 log.debug(f"Annotation Annovar - Annotation cleaning ") 5842 log.debug(f"Annotation - cleaning command: {clean_command}") 5843 run_parallel_commands([clean_command], 1)
Takes the loaded VCF data, annotates it with Annovar, and then updates the database with the new annotations.
Parameters
- threads: number of threads to use for Annovar and for the bcftools/tabix post-processing pipe
Returns
None (the loaded variants are updated in place).
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with one or more parquet-format annotation databases.

        For each database listed under param `annotation.parquet.annotations` (or every
        database discovered via `scan_databases` when the special key "ALL" is present),
        this method builds per-chromosome SQL UPDATE queries that append the selected
        annotation fields to the INFO column of the variants table, and extends the VCF
        header with the corresponding INFO definitions.

        :param threads: number of threads to use for the annotation; defaults to
            `self.get_threads()` when not provided
        :return: None — annotations are written into the variants table in place
        :raises ValueError: if an annotation database file or its `.hdr` header file
            cannot be found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but not otherwise used
        # in this method — temp-file cleanup happens elsewhere; confirm.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Candidate folders where annotation databases may live: both the generic
        # "annotations" folders and the parquet-specific ones, de-duplicated.
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        # Mapping: database identifier -> {field: new_name or None}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        # annotations_update: re-annotate fields already present (old values removed first)
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # annotations_append: only fill fields whose current value is empty ('' or '.')
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header object; INFO definitions for new fields are added to it below
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS — total count, used for the final summary log
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): never populated in this method; the cleanup loop at the
        # end is currently a no-op — confirm whether columns are added elsewhere.
        added_columns = []

        # drop indexes — the UPDATEs below would otherwise churn them
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            if "ALL" in annotations:

                # "ALL" expands to every database found by scan_databases,
                # optionally filtered by formats/releases.
                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" is a directive, not a database — already expanded above
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields; default to the whole INFO column
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object resolving the actual files/connection
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists — both data file and .hdr header are required
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion (ATTACH statement for duckdb, if needed)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    # SQL-addressable reference to the database (table/view/file link)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object (INFO field definitions of the database)
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (columns beyond the standard VCF columns)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    # Register extra columns absent from the header as generic
                    # String INFO fields so they can be annotated too.
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database: "ALL"/"INFO" expand to every header field
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use (SQL CASE fragments, one per field)
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (for regions databases: string_agg per column)
                    sql_query_annotation_to_agregate = []

                    # Number of fields actually scheduled for annotation
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping: VCF field name -> database column name
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column: column holding the field ("INFO" fallback)
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate: the field must exist in the database header, and
                        # either not be present in the VCF header yet, or update/append
                        # mode must be forced.
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO — strip the old value so the
                                # fresh one can be appended by the update query below
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                        concat(table_variants.INFO,''),
                                        ';*{annotation_fields_new_name}=[^;]*',
                                        ''
                                    )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO (no leading ';' for the first one)
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header, defaulting missing attributes
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append mode: extra predicate so only variants whose
                            # current value is empty ('' or '.') get annotated
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column: extract "field=value" from the
                            # database's INFO string via regexp
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                            THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                            ELSE ''
                                    END
                                    """
                                )
                            # Found in a specific column: cast to VARCHAR and replace
                            # ';' by ',' so the value stays a single INFO entry
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                            THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                            ELSE ''
                                    END
                                    """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # (whole-INFO copy shortcut; not valid in append mode or for regions)
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # All fields requested and available: copy INFO wholesale
                        # instead of per-field CASE fragments
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Start from the removal queries so old values are stripped
                        # before the per-chromosome annotation updates run
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: join on positional
                            # overlap (POS within [START+1, END]) and aggregate
                            # overlapping region values per position
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact match on
                            # CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: keep existing INFO, add ';' only
                            # when both sides are non-empty, then append new fields
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x — the generated concat of
                        # many CASE fragments can exceed duckdb's default depth
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # duckdb UPDATE result exposes the affected-row count
                            # in a "Count" column
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
Annotates the loaded variants with one or more parquet-format annotation databases.
Parameters
- threads: number of threads to use for the annotation
Returns
None (annotations are written into the variants table's INFO column in place).
6430 def annotation_splice(self, threads: int = None) -> None: 6431 """ 6432 This function annotate with snpEff 6433 6434 :param threads: The number of threads to use 6435 :return: the value of the variable "return_value". 6436 """ 6437 6438 # DEBUG 6439 log.debug("Start annotation with splice tools") 6440 6441 # Threads 6442 if not threads: 6443 threads = self.get_threads() 6444 log.debug("Threads: " + str(threads)) 6445 6446 # DEBUG 6447 delete_tmp = True 6448 if self.get_config().get("verbosity", "warning") in ["debug"]: 6449 delete_tmp = False 6450 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6451 6452 # Config 6453 config = self.get_config() 6454 log.debug("Config: " + str(config)) 6455 splice_config = config.get("tools", {}).get("splice", {}) 6456 if not splice_config: 6457 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6458 msg_err = "No Splice tool config" 6459 raise ValueError(msg_err) 6460 log.debug(f"splice_config: {splice_config}") 6461 6462 # Config - Folders - Databases 6463 databases_folders = ( 6464 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6465 ) 6466 log.debug("Databases annotations: " + str(databases_folders)) 6467 6468 # Splice docker image 6469 splice_docker_image = splice_config.get("docker").get("image") 6470 6471 # Pull splice image if it's not already there 6472 if not check_docker_image_exists(splice_docker_image): 6473 log.warning( 6474 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6475 ) 6476 try: 6477 command(f"docker pull {splice_config.get('docker').get('image')}") 6478 except subprocess.CalledProcessError: 6479 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6480 log.error(msg_err) 6481 raise ValueError(msg_err) 6482 6483 # Config - splice databases 6484 splice_databases = ( 6485 config.get("folders", {}) 6486 .get("databases", {}) 6487 .get("splice", DEFAULT_SPLICE_FOLDER) 6488 ) 6489 splice_databases = 
full_path(splice_databases) 6490 6491 # Param 6492 param = self.get_param() 6493 log.debug("Param: " + str(param)) 6494 6495 # Param 6496 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6497 log.debug("Options: " + str(options)) 6498 6499 # Data 6500 table_variants = self.get_table_variants() 6501 6502 # Check if not empty 6503 log.debug("Check if not empty") 6504 sql_query_chromosomes = ( 6505 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6506 ) 6507 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6508 log.info("VCF empty") 6509 return None 6510 6511 # Export in VCF 6512 log.debug("Create initial file to annotate") 6513 6514 # Create output folder / work folder 6515 if options.get("output_folder", ""): 6516 output_folder = options.get("output_folder", "") 6517 if not os.path.exists(output_folder): 6518 Path(output_folder).mkdir(parents=True, exist_ok=True) 6519 else: 6520 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6521 if not os.path.exists(output_folder): 6522 Path(output_folder).mkdir(parents=True, exist_ok=True) 6523 6524 if options.get("workdir", ""): 6525 workdir = options.get("workdir", "") 6526 else: 6527 workdir = "/work" 6528 6529 # Create tmp VCF file 6530 tmp_vcf = NamedTemporaryFile( 6531 prefix=self.get_prefix(), 6532 dir=output_folder, 6533 suffix=".vcf", 6534 delete=False, 6535 ) 6536 tmp_vcf_name = tmp_vcf.name 6537 6538 # VCF header 6539 header = self.get_header() 6540 6541 # Existing annotations 6542 for vcf_annotation in self.get_header().infos: 6543 6544 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6545 log.debug( 6546 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6547 ) 6548 6549 # Memory limit 6550 if config.get("memory", None): 6551 memory_limit = config.get("memory", "8G").upper() 6552 # upper() 6553 else: 6554 memory_limit = "8G" 6555 log.debug(f"memory_limit: {memory_limit}") 6556 6557 # 
Check number of variants to annotate 6558 where_clause_regex_spliceai = r"SpliceAI_\w+" 6559 where_clause_regex_spip = r"SPiP_\w+" 6560 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6561 df_list_of_variants_to_annotate = self.get_query_to_df( 6562 query=f""" SELECT * FROM variants {where_clause} """ 6563 ) 6564 if len(df_list_of_variants_to_annotate) == 0: 6565 log.warning( 6566 f"No variants to annotate with splice. Variants probably already annotated with splice" 6567 ) 6568 return None 6569 else: 6570 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6571 6572 # Export VCF file 6573 self.export_variant_vcf( 6574 vcf_file=tmp_vcf_name, 6575 remove_info=True, 6576 add_samples=True, 6577 index=False, 6578 where_clause=where_clause, 6579 ) 6580 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6581 if any(value for value in splice_config.values() if value is None): 6582 log.warning("At least one splice config parameter is empty") 6583 # exit annotation_splice 6584 return None 6585 6586 # Params in splice nf 6587 def check_values(dico: dict): 6588 """ 6589 Ensure parameters for NF splice pipeline 6590 """ 6591 for key, val in dico.items(): 6592 if key == "genome": 6593 if any( 6594 assemb in options.get("genome", {}) 6595 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6596 ): 6597 yield f"--{key} hg19" 6598 elif any( 6599 assemb in options.get("genome", {}) 6600 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6601 ): 6602 yield f"--{key} hg38" 6603 elif ( 6604 (isinstance(val, str) and val) 6605 or isinstance(val, int) 6606 or isinstance(val, bool) 6607 ): 6608 yield f"--{key} {val}" 6609 6610 # Genome 6611 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6612 options["genome"] = genome 6613 # NF params 6614 nf_params = [] 6615 # Add options 6616 if options: 6617 log.debug(options) 6618 nf_params 
= list(check_values(options)) 6619 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6620 else: 6621 log.debug("No NF params provided") 6622 # Add threads 6623 if "threads" not in options.keys(): 6624 nf_params.append(f"--threads {threads}") 6625 # Genome path 6626 genome_path = find_genome( 6627 config.get("folders", {}) 6628 .get("databases", {}) 6629 .get("genomes", DEFAULT_GENOME_FOLDER), 6630 file=f"{genome}.fa", 6631 ) 6632 # Add genome path 6633 if not genome_path: 6634 raise ValueError( 6635 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6636 ) 6637 else: 6638 log.debug(f"Genome: {genome_path}") 6639 nf_params.append(f"--genome_path {genome_path}") 6640 6641 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6642 """ 6643 Setting up updated databases for SPiP and SpliceAI 6644 """ 6645 6646 try: 6647 6648 # SpliceAI assembly transcriptome 6649 spliceai_assembly = os.path.join( 6650 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6651 options.get("genome"), 6652 "transcriptome", 6653 ) 6654 spip_assembly = options.get("genome") 6655 6656 spip = find( 6657 f"transcriptome_{spip_assembly}.RData", 6658 config.get("folders", {}).get("databases", {}).get("spip", {}), 6659 ) 6660 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6661 log.debug(f"SPiP annotations: {spip}") 6662 log.debug(f"SpliceAI annotations: {spliceai}") 6663 if spip and spliceai: 6664 return [ 6665 f"--spip_transcriptome {spip}", 6666 f"--spliceai_transcriptome {spliceai}", 6667 ] 6668 else: 6669 log.warning( 6670 "Can't find splice databases in configuration, use annotations file from image" 6671 ) 6672 except TypeError: 6673 log.warning( 6674 "Can't find splice databases in configuration, use annotations file from image" 6675 ) 6676 return [] 6677 6678 # Add options, check if transcriptome option have already beend provided 6679 if ( 6680 
"spip_transcriptome" not in nf_params 6681 and "spliceai_transcriptome" not in nf_params 6682 ): 6683 splice_reference = splice_annotations(options, config) 6684 if splice_reference: 6685 nf_params.extend(splice_reference) 6686 # nf_params.append(f"--output_folder {output_folder}") 6687 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6688 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6689 log.debug(cmd) 6690 splice_config["docker"]["command"] = cmd 6691 6692 # Ensure proxy is set 6693 proxy = [ 6694 f"-e {var}={os.getenv(var)}" 6695 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6696 if os.getenv(var) is not None 6697 ] 6698 docker_cmd = get_bin_command( 6699 tool="splice", 6700 bin_type="docker", 6701 config=config, 6702 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6703 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6704 ) 6705 # print(docker_cmd) 6706 # exit() 6707 # Docker debug 6708 # if splice_config.get("rm_container"): 6709 # rm_container = "--rm" 6710 # else: 6711 # rm_container = "" 6712 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6713 log.debug(docker_cmd) 6714 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6715 log.debug(res.stdout) 6716 if res.stderr: 6717 log.error(res.stderr) 6718 res.check_returncode() 6719 # Update variants 6720 log.info("Annotation - Updating...") 6721 # Test find output vcf 6722 log.debug( 6723 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6724 ) 6725 output_vcf = [] 6726 # Wrong folder to look in 6727 for 
files in os.listdir(os.path.dirname(tmp_vcf_name)): 6728 if ( 6729 files 6730 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6731 ): 6732 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6733 # log.debug(os.listdir(options.get("output_folder"))) 6734 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6735 if not output_vcf: 6736 log.debug( 6737 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6738 ) 6739 else: 6740 # Get new header from annotated vcf 6741 log.debug(f"Initial header: {len(header.infos)} fields") 6742 # Create new header with splice infos 6743 new_vcf = Variants(input=output_vcf[0]) 6744 new_vcf_header = new_vcf.get_header().infos 6745 for keys, infos in new_vcf_header.items(): 6746 if keys not in header.infos.keys(): 6747 header.infos[keys] = infos 6748 log.debug(f"New header: {len(header.infos)} fields") 6749 log.debug(f"Splice tmp output: {output_vcf[0]}") 6750 self.update_from_vcf(output_vcf[0]) 6751 6752 # Remove file 6753 remove_if_exists(output_vcf)
This function annotates variants with splice prediction tools (SPiP and SpliceAI) run through a dockerized Nextflow pipeline.
Parameters
- threads: The number of threads to use
Returns
None (the loaded variants are updated in place).
6759 def get_config_default(self, name: str) -> dict: 6760 """ 6761 The function `get_config_default` returns a dictionary containing default configurations for 6762 various calculations and prioritizations. 6763 6764 :param name: The `get_config_default` function returns a dictionary containing default 6765 configurations for different calculations and prioritizations. The `name` parameter is used to 6766 specify which specific configuration to retrieve from the dictionary 6767 :type name: str 6768 :return: The function `get_config_default` returns a dictionary containing default configuration 6769 settings for different calculations and prioritizations. The specific configuration settings are 6770 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6771 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6772 returned. If there is no match, an empty dictionary is returned. 6773 """ 6774 6775 config_default = { 6776 "calculations": { 6777 "variant_chr_pos_alt_ref": { 6778 "type": "sql", 6779 "name": "variant_chr_pos_alt_ref", 6780 "description": "Create a variant ID with chromosome, position, alt and ref", 6781 "available": False, 6782 "output_column_name": "variant_chr_pos_alt_ref", 6783 "output_column_type": "String", 6784 "output_column_description": "variant ID with chromosome, position, alt and ref", 6785 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6786 "operation_info": True, 6787 }, 6788 "VARTYPE": { 6789 "type": "sql", 6790 "name": "VARTYPE", 6791 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6792 "available": True, 6793 "table": "variants", 6794 "output_column_name": "VARTYPE", 6795 "output_column_type": "String", 6796 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6797 "operation_query": """ 6798 CASE 6799 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6800 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6801 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6802 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6803 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6804 ELSE 'UNDEFINED' 6805 END 6806 """, 6807 "info_fields": ["SVTYPE"], 6808 "operation_info": True, 6809 }, 6810 "snpeff_hgvs": { 6811 "type": "python", 6812 "name": "snpeff_hgvs", 6813 "description": "HGVS nomenclatures from snpEff annotation", 6814 "available": True, 6815 "function_name": "calculation_extract_snpeff_hgvs", 6816 "function_params": ["snpeff_hgvs", "ANN"], 6817 }, 6818 "snpeff_ann_explode": { 6819 "type": "python", 6820 "name": "snpeff_ann_explode", 6821 "description": "Explode snpEff annotations with uniquify values", 6822 "available": True, 6823 "function_name": "calculation_snpeff_ann_explode", 6824 "function_params": [False, "fields", "snpeff_", "ANN"], 6825 }, 6826 "snpeff_ann_explode_uniquify": { 6827 "type": "python", 6828 "name": "snpeff_ann_explode_uniquify", 6829 "description": "Explode snpEff annotations", 6830 "available": True, 6831 "function_name": "calculation_snpeff_ann_explode", 6832 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6833 }, 6834 "snpeff_ann_explode_json": { 6835 "type": "python", 6836 "name": "snpeff_ann_explode_json", 6837 "description": "Explode snpEff annotations in JSON format", 6838 "available": True, 6839 "function_name": "calculation_snpeff_ann_explode", 6840 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6841 }, 6842 "NOMEN": { 6843 "type": "python", 6844 "name": "NOMEN", 6845 "description": "NOMEN information (e.g. 
NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6846 "available": True, 6847 "function_name": "calculation_extract_nomen", 6848 "function_params": [], 6849 }, 6850 "RENAME_INFO_FIELDS": { 6851 "type": "python", 6852 "name": "RENAME_INFO_FIELDS", 6853 "description": "Rename or remove INFO/tags", 6854 "available": True, 6855 "function_name": "calculation_rename_info_fields", 6856 "function_params": [], 6857 }, 6858 "FINDBYPIPELINE": { 6859 "type": "python", 6860 "name": "FINDBYPIPELINE", 6861 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6862 "available": True, 6863 "function_name": "calculation_find_by_pipeline", 6864 "function_params": ["findbypipeline"], 6865 }, 6866 "FINDBYSAMPLE": { 6867 "type": "python", 6868 "name": "FINDBYSAMPLE", 6869 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6870 "available": True, 6871 "function_name": "calculation_find_by_pipeline", 6872 "function_params": ["findbysample"], 6873 }, 6874 "GENOTYPECONCORDANCE": { 6875 "type": "python", 6876 "name": "GENOTYPECONCORDANCE", 6877 "description": "Concordance of genotype for multi caller VCF", 6878 "available": True, 6879 "function_name": "calculation_genotype_concordance", 6880 "function_params": [], 6881 }, 6882 "BARCODE": { 6883 "type": "python", 6884 "name": "BARCODE", 6885 "description": "BARCODE as VaRank tool", 6886 "available": True, 6887 "function_name": "calculation_barcode", 6888 "function_params": [], 6889 }, 6890 "BARCODEFAMILY": { 6891 "type": "python", 6892 "name": "BARCODEFAMILY", 6893 "description": "BARCODEFAMILY as VaRank tool", 6894 "available": True, 6895 "function_name": "calculation_barcode_family", 6896 "function_params": ["BCF"], 6897 }, 6898 "TRIO": { 6899 "type": "python", 6900 "name": "TRIO", 6901 "description": "Inheritance for a trio family", 6902 "available": True, 6903 "function_name": "calculation_trio", 6904 "function_params": [], 6905 }, 
6906 "VAF": { 6907 "type": "python", 6908 "name": "VAF", 6909 "description": "Variant Allele Frequency (VAF) harmonization", 6910 "available": True, 6911 "function_name": "calculation_vaf_normalization", 6912 "function_params": [], 6913 }, 6914 "VAF_stats": { 6915 "type": "python", 6916 "name": "VAF_stats", 6917 "description": "Variant Allele Frequency (VAF) statistics", 6918 "available": True, 6919 "function_name": "calculation_genotype_stats", 6920 "function_params": ["VAF"], 6921 }, 6922 "DP_stats": { 6923 "type": "python", 6924 "name": "DP_stats", 6925 "description": "Depth (DP) statistics", 6926 "available": True, 6927 "function_name": "calculation_genotype_stats", 6928 "function_params": ["DP"], 6929 }, 6930 "variant_id": { 6931 "type": "python", 6932 "name": "variant_id", 6933 "description": "Variant ID generated from variant position and type", 6934 "available": True, 6935 "function_name": "calculation_variant_id", 6936 "function_params": [], 6937 }, 6938 "transcripts_json": { 6939 "type": "python", 6940 "name": "transcripts_json", 6941 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6942 "available": True, 6943 "function_name": "calculation_transcripts_annotation", 6944 "function_params": ["transcripts_json", None], 6945 }, 6946 "transcripts_ann": { 6947 "type": "python", 6948 "name": "transcripts_ann", 6949 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6950 "available": True, 6951 "function_name": "calculation_transcripts_annotation", 6952 "function_params": [None, "transcripts_ann"], 6953 }, 6954 "transcripts_annotations": { 6955 "type": "python", 6956 "name": "transcripts_annotations", 6957 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6958 "available": True, 6959 "function_name": "calculation_transcripts_annotation", 6960 "function_params": [None, None], 6961 }, 6962 "transcripts_prioritization": { 6963 "type": 
"python", 6964 "name": "transcripts_prioritization", 6965 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6966 "available": True, 6967 "function_name": "calculation_transcripts_prioritization", 6968 "function_params": [], 6969 }, 6970 "transcripts_export": { 6971 "type": "python", 6972 "name": "transcripts_export", 6973 "description": "Export transcripts table/view as a file (using param.json)", 6974 "available": True, 6975 "function_name": "calculation_transcripts_export", 6976 "function_params": [], 6977 }, 6978 }, 6979 "prioritizations": { 6980 "default": { 6981 "ANN2": [ 6982 { 6983 "type": "contains", 6984 "value": "HIGH", 6985 "score": 5, 6986 "flag": "PASS", 6987 "comment": [ 6988 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6989 ], 6990 }, 6991 { 6992 "type": "contains", 6993 "value": "MODERATE", 6994 "score": 3, 6995 "flag": "PASS", 6996 "comment": [ 6997 "A non-disruptive variant that might change protein effectiveness" 6998 ], 6999 }, 7000 { 7001 "type": "contains", 7002 "value": "LOW", 7003 "score": 0, 7004 "flag": "FILTERED", 7005 "comment": [ 7006 "Assumed to be mostly harmless or unlikely to change protein behavior" 7007 ], 7008 }, 7009 { 7010 "type": "contains", 7011 "value": "MODIFIER", 7012 "score": 0, 7013 "flag": "FILTERED", 7014 "comment": [ 7015 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7016 ], 7017 }, 7018 ], 7019 } 7020 }, 7021 } 7022 7023 return config_default.get(name, None)
The function `get_config_default` returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: The `name` parameter is used to specify which specific configuration to retrieve
from the dictionary of default configurations.
Returns
The function `get_config_default` returns a dictionary containing default configuration
settings for different calculations and prioritizations. The specific configuration settings
are retrieved based on the input `name` parameter provided to the function. If the `name`
parameter matches a key in the `config_default` dictionary, the corresponding configuration
settings are returned. If there is no match, an empty dictionary is returned.
7025 def get_config_json( 7026 self, name: str, config_dict: dict = {}, config_file: str = None 7027 ) -> dict: 7028 """ 7029 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7030 default values, a dictionary, and a file. 7031 7032 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7033 the name of the configuration. It is used to identify and retrieve the configuration settings 7034 for a specific component or module 7035 :type name: str 7036 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7037 dictionary that allows you to provide additional configuration settings or overrides. When you 7038 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7039 the key is the configuration setting you want to override or 7040 :type config_dict: dict 7041 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7042 specify the path to a configuration file that contains additional settings. If provided, the 7043 function will read the contents of this file and update the configuration dictionary with the 7044 values found in the file, overriding any existing values with the 7045 :type config_file: str 7046 :return: The function `get_config_json` returns a dictionary containing the configuration 7047 settings. 
7048 """ 7049 7050 # Create with default prioritizations 7051 config_default = self.get_config_default(name=name) 7052 configuration = config_default 7053 # log.debug(f"configuration={configuration}") 7054 7055 # Replace prioritizations from dict 7056 for config in config_dict: 7057 configuration[config] = config_dict[config] 7058 7059 # Replace prioritizations from file 7060 config_file = full_path(config_file) 7061 if config_file: 7062 if os.path.exists(config_file): 7063 with open(config_file) as config_file_content: 7064 config_file_dict = yaml.safe_load(config_file_content) 7065 for config in config_file_dict: 7066 configuration[config] = config_file_dict[config] 7067 else: 7068 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7069 log.error(msg_error) 7070 raise ValueError(msg_error) 7071 7072 return configuration
The function `get_config_json` retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: The `name` parameter is a string that represents the name of the configuration. It is
used to identify and retrieve the configuration settings for a specific component or module.
- config_dict: The `config_dict` parameter is a dictionary that allows you to provide
additional configuration settings or overrides, as key-value pairs where each key is the
configuration setting you want to override.
- config_file: The `config_file` parameter is used to specify the path to a configuration file
that contains additional settings. If provided, the function reads the contents of this file
and updates the configuration dictionary with the values found in the file, overriding any
existing values.
Returns
The function `get_config_json` returns a dictionary containing the configuration settings.
7074 def prioritization( 7075 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7076 ) -> bool: 7077 """ 7078 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7079 prioritizes variants based on configured profiles and criteria. 7080 7081 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7082 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7083 a table name is provided, the method will prioritize the variants in that specific table 7084 :type table: str 7085 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7086 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7087 provided, the code will use a default prefix value of "PZ" 7088 :type pz_prefix: str 7089 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7090 additional parameters specific to the prioritization process. These parameters can include 7091 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7092 configurations needed for the prioritization of variants in a V 7093 :type pz_param: dict 7094 :return: A boolean value (True) is being returned from the `prioritization` function. 
7095 """ 7096 7097 # Config 7098 config = self.get_config() 7099 7100 # Param 7101 param = self.get_param() 7102 7103 # Prioritization param 7104 if pz_param is not None: 7105 prioritization_param = pz_param 7106 else: 7107 prioritization_param = param.get("prioritization", {}) 7108 7109 # Configuration profiles 7110 prioritization_config_file = prioritization_param.get( 7111 "prioritization_config", None 7112 ) 7113 prioritization_config_file = full_path(prioritization_config_file) 7114 prioritizations_config = self.get_config_json( 7115 name="prioritizations", config_file=prioritization_config_file 7116 ) 7117 7118 # Prioritization prefix 7119 pz_prefix_default = "PZ" 7120 if pz_prefix is None: 7121 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7122 7123 # Prioritization options 7124 profiles = prioritization_param.get("profiles", []) 7125 if isinstance(profiles, str): 7126 profiles = profiles.split(",") 7127 pzfields = prioritization_param.get( 7128 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7129 ) 7130 if isinstance(pzfields, str): 7131 pzfields = pzfields.split(",") 7132 default_profile = prioritization_param.get("default_profile", None) 7133 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7134 prioritization_score_mode = prioritization_param.get( 7135 "prioritization_score_mode", "HOWARD" 7136 ) 7137 7138 # Quick Prioritizations 7139 prioritizations = param.get("prioritizations", None) 7140 if prioritizations: 7141 log.info("Quick Prioritization:") 7142 for profile in prioritizations.split(","): 7143 if profile not in profiles: 7144 profiles.append(profile) 7145 log.info(f" {profile}") 7146 7147 # If profile "ALL" provided, all profiles in the config profiles 7148 if "ALL" in profiles: 7149 profiles = list(prioritizations_config.keys()) 7150 7151 for profile in profiles: 7152 if prioritizations_config.get(profile, None): 7153 log.debug(f"Profile '{profile}' configured") 7154 else: 7155 msg_error = f"Profile 
'{profile}' NOT configured" 7156 log.error(msg_error) 7157 raise ValueError(msg_error) 7158 7159 if profiles: 7160 log.info(f"Prioritization... ") 7161 else: 7162 log.debug(f"No profile defined") 7163 return False 7164 7165 if not default_profile and len(profiles): 7166 default_profile = profiles[0] 7167 7168 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7169 log.debug("Profiles to check: " + str(list(profiles))) 7170 7171 # Variables 7172 if table is not None: 7173 table_variants = table 7174 else: 7175 table_variants = self.get_table_variants(clause="update") 7176 log.debug(f"Table to prioritize: {table_variants}") 7177 7178 # Added columns 7179 added_columns = [] 7180 7181 # Create list of PZfields 7182 # List of PZFields 7183 list_of_pzfields_original = pzfields + [ 7184 pzfield + pzfields_sep + profile 7185 for pzfield in pzfields 7186 for profile in profiles 7187 ] 7188 list_of_pzfields = [] 7189 log.debug(f"{list_of_pzfields_original}") 7190 7191 # Remove existing PZfields to use if exists 7192 for pzfield in list_of_pzfields_original: 7193 if self.get_header().infos.get(pzfield, None) is None: 7194 list_of_pzfields.append(pzfield) 7195 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7196 else: 7197 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7198 7199 if list_of_pzfields: 7200 7201 # Explode Infos prefix 7202 explode_infos_prefix = self.get_explode_infos_prefix() 7203 7204 # PZfields tags description 7205 PZfields_INFOS = { 7206 f"{pz_prefix}Tags": { 7207 "ID": f"{pz_prefix}Tags", 7208 "Number": ".", 7209 "Type": "String", 7210 "Description": "Variant tags based on annotation criteria", 7211 }, 7212 f"{pz_prefix}Score": { 7213 "ID": f"{pz_prefix}Score", 7214 "Number": 1, 7215 "Type": "Integer", 7216 "Description": "Variant score based on annotation criteria", 7217 }, 7218 f"{pz_prefix}Flag": { 7219 "ID": f"{pz_prefix}Flag", 7220 "Number": 1, 7221 "Type": "String", 7222 
"Description": "Variant flag based on annotation criteria", 7223 }, 7224 f"{pz_prefix}Comment": { 7225 "ID": f"{pz_prefix}Comment", 7226 "Number": ".", 7227 "Type": "String", 7228 "Description": "Variant comment based on annotation criteria", 7229 }, 7230 f"{pz_prefix}Infos": { 7231 "ID": f"{pz_prefix}Infos", 7232 "Number": ".", 7233 "Type": "String", 7234 "Description": "Variant infos based on annotation criteria", 7235 }, 7236 f"{pz_prefix}Class": { 7237 "ID": f"{pz_prefix}Class", 7238 "Number": ".", 7239 "Type": "String", 7240 "Description": "Variant class based on annotation criteria", 7241 }, 7242 } 7243 7244 # Create INFO fields if not exist 7245 for field in PZfields_INFOS: 7246 field_ID = PZfields_INFOS[field]["ID"] 7247 field_description = PZfields_INFOS[field]["Description"] 7248 if field_ID not in self.get_header().infos and field_ID in pzfields: 7249 field_description = ( 7250 PZfields_INFOS[field]["Description"] 7251 + f", profile {default_profile}" 7252 ) 7253 self.get_header().infos[field_ID] = vcf.parser._Info( 7254 field_ID, 7255 PZfields_INFOS[field]["Number"], 7256 PZfields_INFOS[field]["Type"], 7257 field_description, 7258 "unknown", 7259 "unknown", 7260 code_type_map[PZfields_INFOS[field]["Type"]], 7261 ) 7262 7263 # Create INFO fields if not exist for each profile 7264 for profile in prioritizations_config: 7265 if profile in profiles or profiles == []: 7266 for field in PZfields_INFOS: 7267 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7268 field_description = ( 7269 PZfields_INFOS[field]["Description"] 7270 + f", profile {profile}" 7271 ) 7272 if ( 7273 field_ID not in self.get_header().infos 7274 and field in pzfields 7275 ): 7276 self.get_header().infos[field_ID] = vcf.parser._Info( 7277 field_ID, 7278 PZfields_INFOS[field]["Number"], 7279 PZfields_INFOS[field]["Type"], 7280 field_description, 7281 "unknown", 7282 "unknown", 7283 code_type_map[PZfields_INFOS[field]["Type"]], 7284 ) 7285 7286 # Header 7287 for pzfield in 
list_of_pzfields: 7288 if re.match(f"{pz_prefix}Score.*", pzfield): 7289 added_column = self.add_column( 7290 table_name=table_variants, 7291 column_name=pzfield, 7292 column_type="INTEGER", 7293 default_value="0", 7294 ) 7295 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7296 added_column = self.add_column( 7297 table_name=table_variants, 7298 column_name=pzfield, 7299 column_type="BOOLEAN", 7300 default_value="1", 7301 ) 7302 elif re.match(f"{pz_prefix}Class.*", pzfield): 7303 added_column = self.add_column( 7304 table_name=table_variants, 7305 column_name=pzfield, 7306 column_type="VARCHAR[]", 7307 default_value="null", 7308 ) 7309 else: 7310 added_column = self.add_column( 7311 table_name=table_variants, 7312 column_name=pzfield, 7313 column_type="STRING", 7314 default_value="''", 7315 ) 7316 added_columns.append(added_column) 7317 7318 # Profiles 7319 if profiles: 7320 7321 # foreach profile in configuration file 7322 for profile in prioritizations_config: 7323 7324 # If profile is asked in param, or ALL are asked (empty profile []) 7325 if profile in profiles or profiles == []: 7326 log.info(f"Profile '{profile}'") 7327 7328 sql_set_info_option = "" 7329 7330 sql_set_info = [] 7331 7332 # PZ fields set 7333 7334 # PZScore 7335 if ( 7336 f"{pz_prefix}Score{pzfields_sep}{profile}" 7337 in list_of_pzfields 7338 ): 7339 sql_set_info.append( 7340 f""" 7341 concat( 7342 '{pz_prefix}Score{pzfields_sep}{profile}=', 7343 {pz_prefix}Score{pzfields_sep}{profile} 7344 ) 7345 """ 7346 ) 7347 if ( 7348 profile == default_profile 7349 and f"{pz_prefix}Score" in list_of_pzfields 7350 ): 7351 sql_set_info.append( 7352 f""" 7353 concat( 7354 '{pz_prefix}Score=', 7355 {pz_prefix}Score{pzfields_sep}{profile} 7356 ) 7357 """ 7358 ) 7359 7360 # PZFlag 7361 if ( 7362 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7363 in list_of_pzfields 7364 ): 7365 sql_set_info.append( 7366 f""" 7367 concat( 7368 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7369 CASE 7370 WHEN 
{pz_prefix}Flag{pzfields_sep}{profile}==1 7371 THEN 'PASS' 7372 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7373 THEN 'FILTERED' 7374 END 7375 ) 7376 """ 7377 ) 7378 if ( 7379 profile == default_profile 7380 and f"{pz_prefix}Flag" in list_of_pzfields 7381 ): 7382 sql_set_info.append( 7383 f""" 7384 concat( 7385 '{pz_prefix}Flag=', 7386 CASE 7387 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7388 THEN 'PASS' 7389 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7390 THEN 'FILTERED' 7391 END 7392 ) 7393 """ 7394 ) 7395 7396 # PZClass 7397 if ( 7398 f"{pz_prefix}Class{pzfields_sep}{profile}" 7399 in list_of_pzfields 7400 ): 7401 sql_set_info.append( 7402 f""" 7403 concat( 7404 '{pz_prefix}Class{pzfields_sep}{profile}=', 7405 CASE 7406 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7407 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7408 ELSE '.' 7409 END 7410 ) 7411 7412 """ 7413 ) 7414 if ( 7415 profile == default_profile 7416 and f"{pz_prefix}Class" in list_of_pzfields 7417 ): 7418 sql_set_info.append( 7419 f""" 7420 concat( 7421 '{pz_prefix}Class=', 7422 CASE 7423 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7424 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7425 ELSE '.' 
7426 END 7427 ) 7428 """ 7429 ) 7430 7431 # PZComment 7432 if ( 7433 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7434 in list_of_pzfields 7435 ): 7436 sql_set_info.append( 7437 f""" 7438 CASE 7439 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7440 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7441 ELSE '' 7442 END 7443 """ 7444 ) 7445 if ( 7446 profile == default_profile 7447 and f"{pz_prefix}Comment" in list_of_pzfields 7448 ): 7449 sql_set_info.append( 7450 f""" 7451 CASE 7452 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7453 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7454 ELSE '' 7455 END 7456 """ 7457 ) 7458 7459 # PZInfos 7460 if ( 7461 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7462 in list_of_pzfields 7463 ): 7464 sql_set_info.append( 7465 f""" 7466 CASE 7467 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7468 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7469 ELSE '' 7470 END 7471 """ 7472 ) 7473 if ( 7474 profile == default_profile 7475 and f"{pz_prefix}Infos" in list_of_pzfields 7476 ): 7477 sql_set_info.append( 7478 f""" 7479 CASE 7480 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7481 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7482 ELSE '' 7483 END 7484 """ 7485 ) 7486 7487 # Merge PZfields 7488 sql_set_info_option = "" 7489 sql_set_sep = "" 7490 for sql_set in sql_set_info: 7491 if sql_set_sep: 7492 sql_set_info_option += f""" 7493 , concat('{sql_set_sep}', {sql_set}) 7494 """ 7495 else: 7496 sql_set_info_option += f""" 7497 , {sql_set} 7498 """ 7499 sql_set_sep = ";" 7500 7501 sql_queries = [] 7502 criterion_fields_profile = [] 7503 annotation_view_name = ( 7504 "annotation_view_for_prioritization_" 7505 + str(random.randrange(1000)) 7506 ) 7507 annotation_view_prefix = "" 7508 for annotation in prioritizations_config[profile]: 7509 
7510 # skip special sections 7511 if annotation.startswith("_"): 7512 continue 7513 7514 # For each criterions 7515 for criterion in prioritizations_config[profile][ 7516 annotation 7517 ]: 7518 7519 # Criterion mode 7520 criterion_mode = None 7521 if np.any( 7522 np.isin(list(criterion.keys()), ["type", "value"]) 7523 ): 7524 criterion_mode = "operation" 7525 elif np.any( 7526 np.isin(list(criterion.keys()), ["sql", "fields"]) 7527 ): 7528 criterion_mode = "sql" 7529 log.debug(f"Criterion Mode: {criterion_mode}") 7530 7531 # Criterion parameters 7532 criterion_type = criterion.get("type", None) 7533 criterion_value = criterion.get("value", None) 7534 criterion_sql = criterion.get("sql", None) 7535 criterion_fields = criterion.get("fields", None) 7536 criterion_score = criterion.get("score", 0) 7537 criterion_flag = criterion.get("flag", "PASS") 7538 criterion_class = criterion.get("class", None) 7539 criterion_flag_bool = criterion_flag == "PASS" 7540 criterion_comment = ( 7541 ", ".join(criterion.get("comment", [])) 7542 .replace("'", "''") 7543 .replace(";", ",") 7544 .replace("\t", " ") 7545 ) 7546 criterion_infos = ( 7547 str(criterion) 7548 .replace("'", "''") 7549 .replace(";", ",") 7550 .replace("\t", " ") 7551 ) 7552 7553 # SQL 7554 if criterion_sql is not None and isinstance( 7555 criterion_sql, list 7556 ): 7557 criterion_sql = " ".join(criterion_sql) 7558 7559 # Fields and explode 7560 if criterion_fields is None: 7561 criterion_fields = [annotation] 7562 if not isinstance(criterion_fields, list): 7563 criterion_fields = str(criterion_fields).split(",") 7564 7565 # Class 7566 if criterion_class is not None and not isinstance( 7567 criterion_class, list 7568 ): 7569 criterion_class = str(criterion_class).split(",") 7570 7571 # Add criterion fields to the list of profile's criteria 7572 criterion_fields_profile = list( 7573 set(criterion_fields_profile + criterion_fields) 7574 ) 7575 7576 sql_set = [] 7577 sql_set_info = [] 7578 7579 # PZ fields set 7580 
7581 # PZScore 7582 if ( 7583 f"{pz_prefix}Score{pzfields_sep}{profile}" 7584 in list_of_pzfields 7585 ): 7586 # VaRank prioritization score mode 7587 if prioritization_score_mode.upper().strip() in [ 7588 "VARANK", 7589 "MAX", 7590 "MAXIMUM", 7591 "TOP", 7592 ]: 7593 sql_set.append( 7594 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END " 7595 ) 7596 # default HOWARD prioritization score mode 7597 else: 7598 sql_set.append( 7599 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7600 ) 7601 7602 # PZFlag 7603 if ( 7604 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7605 in list_of_pzfields 7606 ): 7607 sql_set.append( 7608 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7609 ) 7610 7611 # PZClass 7612 if ( 7613 f"{pz_prefix}Class{pzfields_sep}{profile}" 7614 in list_of_pzfields 7615 and criterion_class is not None 7616 ): 7617 sql_set.append( 7618 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7619 ) 7620 7621 # PZComment 7622 if ( 7623 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7624 in list_of_pzfields 7625 ): 7626 sql_set.append( 7627 f""" 7628 {pz_prefix}Comment{pzfields_sep}{profile} = 7629 concat( 7630 {pz_prefix}Comment{pzfields_sep}{profile}, 7631 CASE 7632 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7633 THEN ', ' 7634 ELSE '' 7635 END, 7636 '{criterion_comment}' 7637 ) 7638 """ 7639 ) 7640 7641 # PZInfos 7642 if ( 7643 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7644 in list_of_pzfields 7645 ): 7646 sql_set.append( 7647 f""" 7648 {pz_prefix}Infos{pzfields_sep}{profile} = 7649 concat( 7650 {pz_prefix}Infos{pzfields_sep}{profile}, 7651 '{criterion_infos}' 7652 ) 7653 """ 7654 ) 7655 sql_set_option = ",".join(sql_set) 7656 
7657 # Criterion and comparison 7658 if sql_set_option: 7659 7660 # Operation mode 7661 if criterion_mode in ["operation"]: 7662 7663 # Check if value is a float 7664 try: 7665 float(criterion_value) 7666 sql_update = f""" 7667 UPDATE "{table_variants}" 7668 SET {sql_set_option} 7669 FROM ( 7670 SELECT * 7671 FROM "{annotation_view_name}" 7672 WHERE ( 7673 CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7674 AND CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7675 ) 7676 ) AS "{annotation_view_name}" 7677 WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM" 7678 AND "{table_variants}"."POS" == "{annotation_view_name}"."POS" 7679 AND "{table_variants}"."REF" == "{annotation_view_name}"."REF" 7680 AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 7681 7682 """ 7683 # If not a floatÃ’ 7684 except: 7685 contains_option = "" 7686 if criterion_type == "contains": 7687 contains_option = ".*" 7688 sql_update = f""" 7689 UPDATE "{table_variants}" 7690 SET {sql_set_option} 7691 FROM ( 7692 SELECT * 7693 FROM "{annotation_view_name}" 7694 WHERE ( 7695 "{annotation_view_name}"."{annotation_view_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7696 ) 7697 ) AS "{annotation_view_name}" 7698 WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM" 7699 AND "{table_variants}"."POS" == "{annotation_view_name}"."POS" 7700 AND "{table_variants}"."REF" == "{annotation_view_name}"."REF" 7701 AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 7702 7703 """ 7704 sql_queries.append(sql_update) 7705 7706 # SQL mode 7707 elif criterion_mode in ["sql"]: 7708 7709 sql_update = f""" 7710 UPDATE {table_variants} 7711 SET {sql_set_option} 7712 FROM ( 7713 SELECT * 7714 FROM "{annotation_view_name}" 7715 WHERE ({criterion_sql}) 7716 ) AS "{annotation_view_name}" 7717 WHERE 
"{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM" 7718 AND "{table_variants}"."POS" == "{annotation_view_name}"."POS" 7719 AND "{table_variants}"."REF" == "{annotation_view_name}"."REF" 7720 AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 7721 """ 7722 sql_queries.append(sql_update) 7723 7724 else: 7725 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7726 log.error(msg_err) 7727 raise ValueError(msg_err) 7728 7729 else: 7730 log.warning( 7731 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7732 ) 7733 7734 # PZTags 7735 if ( 7736 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7737 in list_of_pzfields 7738 ): 7739 7740 # Create PZFalgs value 7741 pztags_value = "" 7742 pztags_sep_default = "," 7743 pztags_sep = "" 7744 for pzfield in pzfields: 7745 if pzfield not in [f"{pz_prefix}Tags"]: 7746 if ( 7747 f"{pzfield}{pzfields_sep}{profile}" 7748 in list_of_pzfields 7749 ): 7750 if pzfield in [f"{pz_prefix}Flag"]: 7751 pztags_value += f"""{pztags_sep}{pzfield}#', 7752 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7753 THEN 'PASS' 7754 ELSE 'FILTERED' 7755 END, '""" 7756 elif pzfield in [f"{pz_prefix}Class"]: 7757 pztags_value += f"""{pztags_sep}{pzfield}#', 7758 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7759 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7760 ELSE '.' 
7761 END, '""" 7762 else: 7763 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7764 pztags_sep = pztags_sep_default 7765 7766 # Add Query update for PZFlags 7767 sql_update_pztags = f""" 7768 UPDATE {table_variants} 7769 SET INFO = concat( 7770 INFO, 7771 CASE WHEN INFO NOT in ('','.') 7772 THEN ';' 7773 ELSE '' 7774 END, 7775 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7776 ) 7777 WHERE 1=1 7778 """ 7779 sql_queries.append(sql_update_pztags) 7780 7781 # Add Query update for PZFlags for default 7782 if profile == default_profile: 7783 sql_update_pztags_default = f""" 7784 UPDATE {table_variants} 7785 SET INFO = concat( 7786 INFO, 7787 ';', 7788 '{pz_prefix}Tags={pztags_value}' 7789 ) 7790 WHERE 1=1 7791 """ 7792 sql_queries.append(sql_update_pztags_default) 7793 7794 log.info(f"""Profile '{profile}' - Prioritization... """) 7795 7796 # Create annotations view for prioritization 7797 log.debug( 7798 f"""Profile '{profile}' - Prioritization - Create '{annotation_view_name}' view with '{criterion_fields_profile}'... """ 7799 ) 7800 annotation_view = self.create_annotations_view( 7801 view=annotation_view_name, 7802 prefix=annotation_view_prefix, 7803 fields=criterion_fields_profile, 7804 drop_view=True, 7805 ) 7806 7807 # Chromosomes list 7808 sql_uniq_chrom = f""" 7809 SELECT DISTINCT "#CHROM" 7810 FROM {table_variants} 7811 """ 7812 chroms = self.get_query_to_df(sql_uniq_chrom)["#CHROM"].tolist() 7813 7814 for chrom in chroms: 7815 7816 log.debug( 7817 f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}'... 
""" 7818 ) 7819 7820 if sql_queries: 7821 7822 # Query num 7823 num_query = 0 7824 7825 # For each query 7826 for sql_query in sql_queries: 7827 7828 # Query num 7829 num_query += 1 7830 7831 sql_query_chrom = f""" 7832 {sql_query} 7833 AND {table_variants}."#CHROM" LIKE '{chrom}' 7834 """ 7835 log.debug( 7836 f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}' [{num_query}/{len(sql_queries)}]""" 7837 ) 7838 # log.debug(f"""sql_query_chrom: {sql_query_chrom}""") 7839 self.execute_query(query=sql_query_chrom) 7840 7841 # Update INFO field 7842 log.info(f"""Profile '{profile}' - Update... """) 7843 sql_query_update = f""" 7844 UPDATE {table_variants} 7845 SET INFO = 7846 concat( 7847 CASE 7848 WHEN INFO NOT IN ('','.') 7849 THEN concat(INFO, ';') 7850 ELSE '' 7851 END 7852 {sql_set_info_option} 7853 ) 7854 """ 7855 # log.debug(f"sql_query_update={sql_query_update}") 7856 self.execute_query(query=sql_query_update) 7857 7858 # Remove annotations view for prioritization 7859 query_drop_tmp_table = f""" 7860 DROP VIEW IF EXISTS {annotation_view_name} 7861 """ 7862 self.execute_query(query=query_drop_tmp_table) 7863 7864 else: 7865 7866 log.warning(f"No profiles in parameters") 7867 7868 # Remove added columns 7869 for added_column in added_columns: 7870 self.drop_column(column=added_column) 7871 7872 # Explode INFOS fields into table fields 7873 if self.get_explode_infos(): 7874 self.explode_infos( 7875 prefix=self.get_explode_infos_prefix(), 7876 fields=self.get_explode_infos_fields(), 7877 force=True, 7878 ) 7879 7880 return True
The prioritization function processes VCF files, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.
Parameters
- table: The `table` parameter specifies the name of the table (typically the variants table loaded from a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method prioritizes the variants in that specific table.
- pz_prefix: The `pz_prefix` parameter specifies a prefix that will be added to certain INFO fields in a VCF file during the prioritization process (e.g. Score, Flag, Class, Comment, Infos, Tags). If this parameter is not provided, the default prefix "PZ" is used.
- pz_param: The `pz_param` parameter passes additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file.
Returns
A boolean value (True) is returned by the `prioritization` function.
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        The method loads the genome (pyfaidx), refSeq and refSeqLink databases, extracts SNV/InDel
        variants from the variants table, computes HGVS names per variant in parallel (dask
        partitions), then writes the result back into a temporary column and the INFO field.

        NOTE(review): `pl` (polars) and `dd` (dask.dataframe) are not imported at the top of this
        file as visible here — presumably provided by the wildcard imports from
        howard.functions.commons; confirm.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Closure note: this function reads `polars_conn`, `refseq_df`, `refseqlink_df`,
            `transcripts`, `genome` and the HGVS options (`use_exon`, `use_gene`, ...) from the
            enclosing scope; they are bound later in `annotation_hgvs` but before dask invokes
            this function.

            :param row: A dictionary-like object that contains the values for the following keys:
            CHROM, POS, REF, ALT
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            # NOTE(review): `chr` shadows the builtin; kept as-is.
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): this query references `refseqlink_df`, which is only assigned
                # below when `refseqlink_file` is set — presumably protein options are only
                # enabled together with a refSeqLink database; confirm, otherwise NameError.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name for the same transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): created again further below after refseq_df/refseqlink_df exist;
        # this first SQLContext appears redundant — confirm before removing.
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse comma-separated "key=value" options into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                # Coerce textual booleans
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit file first, otherwise resolve from folder + assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT strictly alphabetic)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table (random suffix to avoid collisions)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe: transcripts overlapping each variant position
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession mapping)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion (re-created here so refseq_df/refseqlink_df globals are registered)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column from the parquet file, joining on variant coordinates
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Update INFO column: append "hgvs=<value>" for annotated variants
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.
Parameters
- threads: The `threads` parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it defaults to the number of threads obtained from the `get_threads()` method.
8273 def get_operations_help( 8274 self, operations_config_dict: dict = {}, operations_config_file: str = None 8275 ) -> list: 8276 8277 # Init 8278 operations_help = [] 8279 8280 # operations 8281 operations = self.get_config_json( 8282 name="calculations", 8283 config_dict=operations_config_dict, 8284 config_file=operations_config_file, 8285 ) 8286 for op in operations: 8287 op_name = operations[op].get("name", op).upper() 8288 op_description = operations[op].get("description", op_name) 8289 op_available = operations[op].get("available", False) 8290 if op_available: 8291 operations_help.append(f" {op_name}: {op_description}") 8292 8293 # Sort operations 8294 operations_help.sort() 8295 8296 # insert header 8297 operations_help.insert(0, "Available calculation operations:") 8298 8299 # Return 8300 return operations_help
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        :param operations: Dictionary of operations to apply (keys are operation names);
        may be overridden by the "calculation"/"calculations" sections of the parameters
        :type operations: dict (optional)
        :param operations_config_dict: Optional dictionary with the operations configuration
        :type operations_config_dict: dict (optional)
        :param operations_config_file: Optional path to an operations configuration file;
        if None, taken from param["calculation"]["calculation_config"]
        :type operations_config_file: str (optional)
        :raises ValueError: if an operation is unknown or its type is neither "python" nor "sql"

        param json example:
            "calculation": {
              "NOMEN": {
                "options": {
                  "hgvs_field": "hgvs"
                },
                "middle": null
              }
            }
        """

        # Param
        param = self.get_param()

        # Check operations config file
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation lookup is case-insensitive)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (param takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add (comma-separated list in param["calculations"])
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    # Reuse any options already configured for this operation
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (fallback to param if still empty)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        # Dispatch on operation type: python function or SQL query
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It takes a list of operations and, for each operation, checks whether it is a Python or SQL operation, then calls the appropriate function.
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }
8430 def calculation_process_sql( 8431 self, operation: dict, operation_name: str = "unknown" 8432 ) -> None: 8433 """ 8434 The `calculation_process_sql` function takes in a mathematical operation as a string and 8435 performs the operation, updating the specified table with the result. 8436 8437 :param operation: The `operation` parameter is a dictionary that contains information about the 8438 mathematical operation to be performed. It includes the following keys: 8439 :type operation: dict 8440 :param operation_name: The `operation_name` parameter is a string that represents the name of 8441 the mathematical operation being performed. It is used for logging and error handling purposes, 8442 defaults to unknown 8443 :type operation_name: str (optional) 8444 """ 8445 8446 # Operation infos 8447 operation_name = operation.get("name", "unknown") 8448 log.debug(f"process SQL {operation_name}") 8449 output_column_name = operation.get("output_column_name", operation_name) 8450 output_column_type = operation.get("output_column_type", "String") 8451 prefix = operation.get("explode_infos_prefix", "") 8452 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8453 output_column_description = operation.get( 8454 "output_column_description", f"{operation_name} operation" 8455 ) 8456 operation_query = operation.get("operation_query", None) 8457 if isinstance(operation_query, list): 8458 operation_query = " ".join(operation_query) 8459 operation_info_fields = operation.get("info_fields", []) 8460 operation_info_fields_check = operation.get("info_fields_check", False) 8461 operation_info = operation.get("operation_info", True) 8462 operation_table = operation.get( 8463 "table", self.get_table_variants(clause="alter") 8464 ) 8465 8466 # table variants 8467 if operation_table: 8468 table_variants = operation_table 8469 else: 8470 table_variants = self.get_table_variants(clause="alter") 8471 8472 if operation_query: 8473 8474 # Info fields check 8475 
operation_info_fields_check_result = True 8476 if operation_info_fields_check: 8477 header_infos = self.get_header().infos 8478 for info_field in operation_info_fields: 8479 operation_info_fields_check_result = ( 8480 operation_info_fields_check_result 8481 and info_field in header_infos 8482 ) 8483 8484 # If info fields available 8485 if operation_info_fields_check_result: 8486 8487 # Added_columns 8488 added_columns = [] 8489 8490 # Create VCF header field 8491 vcf_reader = self.get_header() 8492 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8493 output_column_name, 8494 ".", 8495 output_column_type, 8496 output_column_description, 8497 "howard calculation", 8498 "0", 8499 self.code_type_map.get(output_column_type), 8500 ) 8501 8502 # Explode infos if needed 8503 log.debug(f"calculation_process_sql prefix {prefix}") 8504 added_columns += self.explode_infos( 8505 prefix=prefix, 8506 fields=[output_column_name] + operation_info_fields, 8507 force=False, 8508 table=table_variants, 8509 ) 8510 8511 # Create column 8512 added_column = self.add_column( 8513 table_name=table_variants, 8514 column_name=prefix + output_column_name, 8515 column_type=output_column_type_sql, 8516 default_value="null", 8517 ) 8518 added_columns.append(added_column) 8519 8520 # Operation calculation 8521 try: 8522 8523 # Query to update calculation column 8524 sql_update = f""" 8525 UPDATE {table_variants} 8526 SET "{prefix}{output_column_name}" = ({operation_query}) 8527 """ 8528 self.conn.execute(sql_update) 8529 8530 # Add to INFO 8531 if operation_info: 8532 sql_update_info = f""" 8533 UPDATE {table_variants} 8534 SET "INFO" = 8535 concat( 8536 CASE 8537 WHEN "INFO" IS NOT NULL 8538 THEN concat("INFO", ';') 8539 ELSE '' 8540 END, 8541 '{output_column_name}=', 8542 "{prefix}{output_column_name}" 8543 ) 8544 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8545 """ 8546 self.conn.execute(sql_update_info) 8547 8548 except: 8549 
log.error( 8550 f"Operations config: Calculation '{operation_name}' query failed" 8551 ) 8552 raise ValueError( 8553 f"Operations config: Calculation '{operation_name}' query failed" 8554 ) 8555 8556 # Remove added columns 8557 for added_column in added_columns: 8558 log.debug(f"added_column: {added_column}") 8559 self.drop_column(column=added_column) 8560 8561 else: 8562 log.error( 8563 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8564 ) 8565 raise ValueError( 8566 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8567 ) 8568 8569 else: 8570 log.error( 8571 f"Operations config: Calculation '{operation_name}' query NOT defined" 8572 ) 8573 raise ValueError( 8574 f"Operations config: Calculation '{operation_name}' query NOT defined" 8575 )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the mathematical operation to be performed.
- operation_name: The `operation_name` parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes; defaults to "unknown".
8577 def calculation_process_function( 8578 self, operation: dict, operation_name: str = "unknown" 8579 ) -> None: 8580 """ 8581 The `calculation_process_function` takes in an operation dictionary and performs the specified 8582 function with the given parameters. 8583 8584 :param operation: The `operation` parameter is a dictionary that contains information about the 8585 operation to be performed. It has the following keys: 8586 :type operation: dict 8587 :param operation_name: The `operation_name` parameter is a string that represents the name of 8588 the operation being performed. It is used for logging purposes, defaults to unknown 8589 :type operation_name: str (optional) 8590 """ 8591 8592 operation_name = operation["name"] 8593 log.debug(f"process Python {operation_name}") 8594 function_name = operation["function_name"] 8595 function_params = operation["function_params"] 8596 getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the operation to be performed.
- operation_name: The `operation_name` parameter is a string that represents the name of the operation being performed. It is used for logging purposes; defaults to "unknown".
    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.

        NOTE(review): the variant_id column is assumed to be created/populated by
        `get_variant_id_column()` — confirm against its implementation.
        """

        # variant_id annotation field (column name, also used as the INFO tag)
        variant_id_tag = self.get_variant_id_column()
        added_columns = [variant_id_tag]

        # variant_id tag descriptions
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update: append "tag=value" to INFO (with ';' separator unless INFO is empty)
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
        function is used to specify the name of the column that will store the HGVS nomenclatures
        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
        function represents the field in the VCF file that contains SnpEff annotations. This field is
        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
        to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff annotation header description cannot be parsed
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any truthy prefix is replaced by the literal "INFO/" —
        # looks intentional (exploded columns use the INFO/ prefix) but confirm.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary, dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: field names are quoted and pipe-separated
            # in the snpEff header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized (alphanumeric-only) key -> original field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe with the variant id and the exploded ANN column
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column (parse each ANN value into HGVS nomenclatures)
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append "snpeff_hgvs=<value>" to INFO, joining the dataframe
            # (registered with duckdb by name) on the variant id
            # NOTE(review): "UPDATE variants" is hardcoded while the WHERE clause uses
            # {table_variants} — presumably always "variants"; confirm.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory eagerly)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
- `snpeff_hgvs`: the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file. Defaults to "snpeff_hgvs".
- `snpeff_field`: the field in the VCF file that contains the SnpEff annotations from which the HGVS nomenclatures are extracted. Defaults to "ANN".
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode the snpEff annotation field of the VCF into separate INFO tags
        (one per snpEff sub-field) or into a single JSON INFO tag, and append
        them to the variants table.

        :param uniquify: Whether duplicate annotation values should be removed
            from the exploded output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: Output format of the exploded annotations:
            "fields" produces one INFO tag per snpEff sub-field, "JSON"
            produces a single JSON INFO tag, defaults to "fields"
        :type output_format: str (optional)
        :param output_prefix: Prefix added to the generated INFO tags to
            differentiate them from existing annotations, defaults to
            "snpeff_"
        :type output_prefix: str (optional)
        :param snpeff_field: Name of the INFO field that contains the snpEff
            annotations to explode, defaults to "ANN"
        :type snpeff_field: str (optional)
        """

        # Internal name of the exploded-annotations column
        snpeff_hgvs = "snpeff_ann_explode"

        # Base description for the INFO tags added to the VCF header
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix of exploded INFO columns.
        # NOTE(review): any non-empty configured prefix is forced to "INFO/";
        # confirm get_explode_infos_prefix() never returns None.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the snpEff annotation and its exploded form
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header object
        vcf_reader = self.get_header()

        # Columns added by this calculation (dropped at the end)
        added_columns = []

        # Explode the snpEff annotation field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the snpEff sub-field names from the single-quoted part of
            # the header description (e.g. 'Allele | Annotation | ...')
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key -> original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Unique variant id column (join key for the SQL update)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant id and snpEff annotation into a dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode the snpEff annotations for each variant
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the new INFO tag(s) in the VCF header
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # Single JSON tag.
                # NOTE(review): the tag id is output_prefix itself (e.g.
                # "snpeff_", trailing separator included) — confirm intended.
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                # One INFO tag per snpEff sub-field
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO, joining on the variant
            # id. The SQL references the local dataframe_snpeff_hgvs by name:
            # duckdb resolves pandas dataframes from the caller's scope, so
            # that variable name must not change.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{ann_annotations_prefix}',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Release the dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Drop the columns created for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
exploding the annotation field and updating the variant information accordingly.

Parameters
- `uniquify`: a boolean flag that determines whether the output should be uniquified, i.e. whether duplicate entries should be removed. Defaults to True.
- `output_format`: the format in which the output annotations will be generated; "fields" produces one annotation per SnpEff sub-field, while "JSON" outputs the annotations in JSON format. Defaults to "fields".
- `output_prefix`: the prefix added to the output annotations generated during the calculation, which differentiates the newly added annotations from existing ones. Defaults to "snpeff_".
- `snpeff_field`: the field in the VCF file that contains the SnpEff annotations to be exploded. Defaults to "ANN".
    def calculation_extract_nomen(self) -> None:
        """
        Extract the NOMEN HGVS nomenclature fields (NOMEN, CNOMEN, RNOMEN,
        NNOMEN, PNOMEN, TVNOMEN, TNOMEN, VNOMEN, ENOMEN, GNOMEN) from the HGVS
        annotation field and append them to the INFO column of the variants
        table.
        """

        # Name of the dataframe column holding the NOMEN structure
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: INFO tag -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Parameters dict
        param = self.get_param()

        # Threads
        # NOTE(review): threads is retrieved but never used in this method
        threads = self.get_threads()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # VCF header object
        vcf_reader = self.get_header()

        # Columns added by this calculation (dropped at the end)
        added_columns = []

        # HGVS field to parse (option 'hgvs_field', default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # NOMEN pattern (option 'pattern')
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # Transcripts of preference, keyed by source ("file", ...)
        transcripts_sources = {}

        # Transcripts of preference from a file (option 'transcripts')
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the file holds the transcript ids
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Table holding the preferred-transcript column (option
        # 'transcripts_table', default: the variants table)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Preferred-transcript column (option 'transcripts_column')
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # Qualified column used as 'transcript' in the SQL below
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(
                fields=[transcripts_column], table=transcripts_table
            )
        else:
            extra_field_transcript = f"NULL"

        # Transcripts-of-preference source order (option 'transcripts_order')
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Exploded ("extra") columns available on the variants table
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Load variant keys, HGVS and preferred transcript into a dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Transcript -> preference rank (1-based, in file order)
            transcripts_rank = {
                transcript: rank for rank, transcript in enumerate(transcripts, start=1)
            }
            transcripts_len = len(transcripts_rank)

            # Compute the NOMEN structure for each variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len,
                ),
                axis=1,
            )

            # Declare each NOMEN tag in the VCF header and build the SQL
            # fragment that appends ';<TAG>=<value>' to INFO
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Add field to SQL query update
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the NOMEN tags to INFO, joining on #CHROM/POS/REF/ALT.
            # The SQL references the local dataframe_hgvs by name: duckdb
            # resolves pandas dataframes from the caller's scope, so that
            # variable name must not change.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
                """
            self.conn.execute(sql_update)

            # Release the dataframe
            del dataframe_hgvs
            gc.collect()

        # Drop the columns created for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Count the number of pipelines/samples in which each variant is found
        and append the result to the INFO column of the variants table.

        Does nothing when the VCF has no FORMAT column or no samples.

        :param tag: Name of the INFO tag for the "findbypipeline" information;
            used both for the VCF header declaration and for the value written
            into the variants table, defaults to "findbypipeline"
        :type tag: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # Description of the INFO tag added to the VCF header
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column holding the computed value
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header object
            vcf_reader = self.get_header()

            # Unique variant id column (join key for the SQL update)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and all sample columns
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Load genotypes into a dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value for each variant
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO, joining on the variant id.
            # The SQL references the local dataframe_findbypipeline by name:
            # duckdb resolves pandas dataframes from the caller's scope, so
            # that variable name must not change.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Drop the columns created for this calculation
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe
            del dataframe_findbypipeline
            gc.collect()
The function `calculation_find_by_pipeline` counts the number of pipelines/samples in which a
variant is found and updates the variant information in a VCF file.

Parameters
- `tag`: a string that names the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table. Defaults to "findbypipeline".
9276 def calculation_genotype_concordance(self) -> None: 9277 """ 9278 The function `calculation_genotype_concordance` calculates the genotype concordance for 9279 multi-caller VCF files and updates the variant information in the database. 9280 """ 9281 9282 # if FORMAT and samples 9283 if ( 9284 "FORMAT" in self.get_header_columns_as_list() 9285 and self.get_header_sample_list() 9286 ): 9287 9288 # genotypeconcordance annotation field 9289 genotypeconcordance_tag = "genotypeconcordance" 9290 9291 # VCF infos tags 9292 vcf_infos_tags = { 9293 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9294 } 9295 9296 # Prefix 9297 prefix = self.get_explode_infos_prefix() 9298 9299 # Field 9300 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9301 9302 # Variants table 9303 table_variants = self.get_table_variants() 9304 9305 # Header 9306 vcf_reader = self.get_header() 9307 9308 # Create variant id 9309 variant_id_column = self.get_variant_id_column() 9310 added_columns = [variant_id_column] 9311 9312 # variant_id, FORMAT and samples 9313 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9314 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9315 ) 9316 9317 # Create dataframe 9318 dataframe_genotypeconcordance = self.get_query_to_df( 9319 f""" SELECT {samples_fields} FROM {table_variants} """ 9320 ) 9321 9322 # Create genotypeconcordance column 9323 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9324 dataframe_genotypeconcordance.apply( 9325 lambda row: genotypeconcordance( 9326 row, samples=self.get_header_sample_list() 9327 ), 9328 axis=1, 9329 ) 9330 ) 9331 9332 # Add genotypeconcordance to header 9333 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9334 genotypeconcordance_tag, 9335 ".", 9336 "String", 9337 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9338 "howard calculation", 9339 "0", 9340 self.code_type_map.get("String"), 9341 ) 9342 9343 # 
Update 9344 sql_update = f""" 9345 UPDATE variants 9346 SET "INFO" = 9347 concat( 9348 CASE 9349 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9350 THEN '' 9351 ELSE concat("INFO", ';') 9352 END, 9353 CASE 9354 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9355 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9356 THEN concat( 9357 '{genotypeconcordance_tag}=', 9358 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9359 ) 9360 ELSE '' 9361 END 9362 ) 9363 FROM dataframe_genotypeconcordance 9364 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9365 """ 9366 self.conn.execute(sql_update) 9367 9368 # Remove added columns 9369 for added_column in added_columns: 9370 self.drop_column(column=added_column) 9371 9372 # Delete dataframe 9373 del dataframe_genotypeconcordance 9374 gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
9376 def calculation_barcode(self, tag: str = "barcode") -> None: 9377 """ 9378 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9379 updates the INFO field in the file with the calculated barcode values. 9380 9381 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9382 name that will be used for the barcode calculation in the VCF file. If no tag name is provided, 9383 the default tag name is set to "barcode", defaults to barcode 9384 :type tag: str (optional) 9385 """ 9386 9387 # if FORMAT and samples 9388 if ( 9389 "FORMAT" in self.get_header_columns_as_list() 9390 and self.get_header_sample_list() 9391 ): 9392 9393 # barcode annotation field 9394 if not tag: 9395 tag = "barcode" 9396 9397 # VCF infos tags 9398 vcf_infos_tags = { 9399 tag: "barcode calculation (VaRank)", 9400 } 9401 9402 # Prefix 9403 prefix = self.get_explode_infos_prefix() 9404 9405 # Field 9406 barcode_infos = prefix + tag 9407 9408 # Variants table 9409 table_variants = self.get_table_variants() 9410 9411 # Header 9412 vcf_reader = self.get_header() 9413 9414 # Create variant id 9415 variant_id_column = self.get_variant_id_column() 9416 added_columns = [variant_id_column] 9417 9418 # variant_id, FORMAT and samples 9419 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9420 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9421 ) 9422 9423 # Create dataframe 9424 dataframe_barcode = self.get_query_to_df( 9425 f""" SELECT {samples_fields} FROM {table_variants} """ 9426 ) 9427 9428 # Create barcode column 9429 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9430 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9431 ) 9432 9433 # Add barcode to header 9434 vcf_reader.infos[tag] = vcf.parser._Info( 9435 tag, 9436 ".", 9437 "String", 9438 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9439 "howard calculation", 9440 "0", 9441 
self.code_type_map.get("String"), 9442 ) 9443 9444 # Update 9445 sql_update = f""" 9446 UPDATE {table_variants} 9447 SET "INFO" = 9448 concat( 9449 CASE 9450 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9451 THEN '' 9452 ELSE concat("INFO", ';') 9453 END, 9454 CASE 9455 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9456 AND dataframe_barcode."{barcode_infos}" NOT NULL 9457 THEN concat( 9458 '{tag}=', 9459 dataframe_barcode."{barcode_infos}" 9460 ) 9461 ELSE '' 9462 END 9463 ) 9464 FROM dataframe_barcode 9465 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9466 """ 9467 self.conn.execute(sql_update) 9468 9469 # Remove added columns 9470 for added_column in added_columns: 9471 self.drop_column(column=added_column) 9472 9473 # Delete dataframe 9474 del dataframe_barcode 9475 gc.collect()
The `calculation_barcode` function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.

Parameters
- `tag`: the tag name that will be used for the barcode annotation in the VCF file. If no tag name is provided, the default tag name "barcode" is used. Defaults to "barcode".
9477 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9478 """ 9479 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9480 and updates the INFO field in the file with the calculated barcode values. 9481 9482 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9483 the barcode tag that will be added to the VCF file during the calculation process. If no value 9484 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9485 :type tag: str (optional) 9486 """ 9487 9488 # if FORMAT and samples 9489 if ( 9490 "FORMAT" in self.get_header_columns_as_list() 9491 and self.get_header_sample_list() 9492 ): 9493 9494 # barcode annotation field 9495 if not tag: 9496 tag = "BCF" 9497 9498 # VCF infos tags 9499 vcf_infos_tags = { 9500 tag: "barcode family calculation", 9501 f"{tag}S": "barcode family samples", 9502 } 9503 9504 # Param 9505 param = self.get_param() 9506 log.debug(f"param={param}") 9507 9508 # Prefix 9509 prefix = self.get_explode_infos_prefix() 9510 9511 # PED param 9512 ped = ( 9513 param.get("calculation", {}) 9514 .get("calculations", {}) 9515 .get("BARCODEFAMILY", {}) 9516 .get("family_pedigree", None) 9517 ) 9518 log.debug(f"ped={ped}") 9519 9520 # Load PED 9521 if ped: 9522 9523 # Pedigree is a file 9524 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9525 log.debug("Pedigree is file") 9526 with open(full_path(ped)) as ped: 9527 ped = yaml.safe_load(ped) 9528 9529 # Pedigree is a string 9530 elif isinstance(ped, str): 9531 log.debug("Pedigree is str") 9532 try: 9533 ped = json.loads(ped) 9534 log.debug("Pedigree is json str") 9535 except ValueError as e: 9536 ped_samples = ped.split(",") 9537 ped = {} 9538 for ped_sample in ped_samples: 9539 ped[ped_sample] = ped_sample 9540 9541 # Pedigree is a dict 9542 elif isinstance(ped, dict): 9543 log.debug("Pedigree is dict") 9544 9545 # Pedigree is not well formatted 9546 
else: 9547 msg_error = "Pedigree not well formatted" 9548 log.error(msg_error) 9549 raise ValueError(msg_error) 9550 9551 # Construct list 9552 ped_samples = list(ped.values()) 9553 9554 else: 9555 log.debug("Pedigree not defined. Take all samples") 9556 ped_samples = self.get_header_sample_list() 9557 ped = {} 9558 for ped_sample in ped_samples: 9559 ped[ped_sample] = ped_sample 9560 9561 # Check pedigree 9562 if not ped or len(ped) == 0: 9563 msg_error = f"Error in pedigree: samples {ped_samples}" 9564 log.error(msg_error) 9565 raise ValueError(msg_error) 9566 9567 # Log 9568 log.info( 9569 "Calculation 'BARCODEFAMILY' - Samples: " 9570 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 9571 ) 9572 log.debug(f"ped_samples={ped_samples}") 9573 9574 # Field 9575 barcode_infos = prefix + tag 9576 9577 # Variants table 9578 table_variants = self.get_table_variants() 9579 9580 # Header 9581 vcf_reader = self.get_header() 9582 9583 # Create variant id 9584 variant_id_column = self.get_variant_id_column() 9585 added_columns = [variant_id_column] 9586 9587 # variant_id, FORMAT and samples 9588 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9589 [f""" "{sample}" """ for sample in ped_samples] 9590 ) 9591 9592 # Create dataframe 9593 dataframe_barcode = self.get_query_to_df( 9594 f""" SELECT {samples_fields} FROM {table_variants} """ 9595 ) 9596 9597 # Create barcode column 9598 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9599 lambda row: barcode(row, samples=ped_samples), axis=1 9600 ) 9601 9602 # Add barcode family to header 9603 # Add vaf_normalization to header 9604 vcf_reader.formats[tag] = vcf.parser._Format( 9605 id=tag, 9606 num=".", 9607 type="String", 9608 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9609 type_code=self.code_type_map.get("String"), 9610 ) 9611 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9612 id=f"{tag}S", 9613 num=".", 9614 type="String", 9615 desc=vcf_infos_tags.get(f"{tag}S", 
"barcode family samples"), 9616 type_code=self.code_type_map.get("String"), 9617 ) 9618 9619 # Update 9620 # for sample in ped_samples: 9621 sql_update_set = [] 9622 for sample in self.get_header_sample_list() + ["FORMAT"]: 9623 if sample in ped_samples: 9624 value = f'dataframe_barcode."{barcode_infos}"' 9625 value_samples = ( 9626 "'" 9627 + ",".join([f""" "{sample}" """ for sample in ped_samples]) 9628 + "'" 9629 ) 9630 ped_samples 9631 elif sample == "FORMAT": 9632 value = f"'{tag}'" 9633 value_samples = f"'{tag}S'" 9634 else: 9635 value = "'.'" 9636 value_samples = "'.'" 9637 format_regex = r"[a-zA-Z0-9\s]" 9638 sql_update_set.append( 9639 f""" 9640 "{sample}" = 9641 concat( 9642 CASE 9643 WHEN {table_variants}."{sample}" = './.' 9644 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9645 ELSE {table_variants}."{sample}" 9646 END, 9647 ':', 9648 {value}, 9649 ':', 9650 {value_samples} 9651 ) 9652 """ 9653 ) 9654 9655 sql_update_set_join = ", ".join(sql_update_set) 9656 sql_update = f""" 9657 UPDATE {table_variants} 9658 SET {sql_update_set_join} 9659 FROM dataframe_barcode 9660 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9661 """ 9662 self.conn.execute(sql_update) 9663 9664 # Remove added columns 9665 for added_column in added_columns: 9666 self.drop_column(column=added_column) 9667 9668 # Delete dataframe 9669 del dataframe_barcode 9670 gc.collect()
The `calculation_barcode_family` function calculates family barcode values for variants in a VCF
file and updates the FORMAT and sample fields with the calculated barcode values and the list of
samples used.

Parameters
- `tag`: the barcode FORMAT tag that will be added to the VCF file during the calculation (a companion `<tag>S` tag lists the samples used). If no value is provided, the default value "BCF" is used. Defaults to "BCF".
    def calculation_trio(self) -> None:
        """
        Perform trio calculations on a VCF file by adding trio information to the
        INFO field of each variant.

        The trio pedigree (father/mother/child) is taken from the
        `calculation.calculations.TRIO.trio_pedigree` parameter, which may be a
        YAML file path, a JSON string, a comma-separated string of 3 sample names,
        or a dict. If absent, the first 3 samples of the header are used.

        :raises ValueError: if the pedigree is malformed or fewer than 3 samples
            are available.
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field (INFO tag name)
            trio_tag = "trio"

            # VCF infos tags (descriptions for the header)
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Trio pedigree parameter (file path, JSON/CSV string, or dict)
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file (YAML/JSON loadable via yaml.safe_load)
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = yaml.safe_load(trio_ped)

                # Trio pedigree is a string: try JSON first, then "father,mother,child"
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is already a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio sample list in fixed father/mother/child order
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                # No pedigree configured: assume the first 3 header samples
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree (must have exactly the 3 expected members)
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Column name holding the computed trio value in the dataframe
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (added column, dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples (sample names quoted for SQL)
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe of genotypes to compute on
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column via the module-level `trio` helper, row by row
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the "snpEff hgvs annotations" fallback below looks
            # copy-pasted from another calculation; it is never used since
            # "trio" is always a key of vcf_infos_tags.
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append 'trio=<value>' to INFO, joining on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                                AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                '{trio_tag}=',
                                dataframe_trio."{trio_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe and reclaim memory
            del dataframe_trio
            gc.collect()
The `calculation_trio` function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9851 def calculation_vaf_normalization(self) -> None: 9852 """ 9853 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9854 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9855 :return: The function does not return anything. 9856 """ 9857 9858 # if FORMAT and samples 9859 if ( 9860 "FORMAT" in self.get_header_columns_as_list() 9861 and self.get_header_sample_list() 9862 ): 9863 9864 # vaf_normalization annotation field 9865 vaf_normalization_tag = "VAF" 9866 9867 # VCF infos tags 9868 vcf_infos_tags = { 9869 "VAF": "VAF Variant Frequency", 9870 } 9871 9872 # Prefix 9873 prefix = self.get_explode_infos_prefix() 9874 9875 # Variants table 9876 table_variants = self.get_table_variants() 9877 9878 # Header 9879 vcf_reader = self.get_header() 9880 9881 # Do not calculate if VAF already exists 9882 if "VAF" in vcf_reader.formats: 9883 log.debug("VAF already on genotypes") 9884 return 9885 9886 # Create variant id 9887 variant_id_column = self.get_variant_id_column() 9888 added_columns = [variant_id_column] 9889 9890 # variant_id, FORMAT and samples 9891 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9892 f""" "{sample}" """ for sample in self.get_header_sample_list() 9893 ) 9894 9895 # Create dataframe 9896 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9897 log.debug(f"query={query}") 9898 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9899 9900 vaf_normalization_set = [] 9901 9902 # for each sample vaf_normalization 9903 for sample in self.get_header_sample_list(): 9904 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9905 lambda row: vaf_normalization(row, sample=sample), axis=1 9906 ) 9907 vaf_normalization_set.append( 9908 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9909 ) 9910 9911 # Add VAF to FORMAT 9912 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9913 "FORMAT" 9914 ].apply(lambda x: str(x) + ":VAF") 9915 vaf_normalization_set.append( 9916 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9917 ) 9918 9919 # Add vaf_normalization to header 9920 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9921 id=vaf_normalization_tag, 9922 num="1", 9923 type="Float", 9924 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9925 type_code=self.code_type_map.get("Float"), 9926 ) 9927 9928 # Create fields to add in INFO 9929 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9930 9931 # Update 9932 sql_update = f""" 9933 UPDATE {table_variants} 9934 SET {sql_vaf_normalization_set} 9935 FROM dataframe_vaf_normalization 9936 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9937 9938 """ 9939 self.conn.execute(sql_update) 9940 9941 # Remove added columns 9942 for added_column in added_columns: 9943 self.drop_column(column=added_column) 9944 9945 # Delete dataframe 9946 del dataframe_vaf_normalization 9947 gc.collect()
The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Calculate genotype statistics for a given information field in a VCF file
        and update the INFO column of the variants table with the calculated
        statistics.

        For field `info` (e.g. "VAF"), the stats produced per variant are:
        `<info>_stats_nb`, `_list`, `_min`, `_max`, `_mean`, `_mediane`, `_stdev`
        (computed by the module-level `genotype_stats` helper).

        :param info: The `info` parameter is a string that represents the type of
            information for which genotype statistics are calculated. It is used
            to generate the VCF info tags for the statistics, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one per derived statistic; the keys double as
            # INFO tag names and dataframe column names below
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Column name holding the stats dict in the dataframe
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (added column, dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples (sample names quoted for SQL)
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe of genotypes to compute on
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (one dict of stats per row)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic
            sql_vaf_stats_fields = []

            # For each statistic: extract it to its own column, declare it in
            # the header, and build its INFO concat fragment
            for stat in vcf_infos_tags:

                # Extract this stat from the per-row stats dict
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add this stat tag to the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator for every fragment after the first one
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update: append all stat tags to INFO, joining on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe and reclaim memory
            del dataframe_vaf_stats
            gc.collect()
The `calculation_genotype_stats` function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.

Parameters

- info: The `info` parameter is a string that represents the type of information for which
  genotype statistics are calculated. It is used to generate various VCF info tags for the
  statistics, such as the number of occurrences, the list of values, the minimum value, the
  maximum value, the mean, and the median. Defaults to VAF.
10087 def calculation_transcripts_annotation( 10088 self, info_json: str = None, info_format: str = None 10089 ) -> None: 10090 """ 10091 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 10092 field to it if transcripts are available. 10093 10094 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10095 is a string parameter that represents the information field to be used in the transcripts JSON. 10096 It is used to specify the JSON format for the transcripts information. If no value is provided 10097 when calling the method, it defaults to " 10098 :type info_json: str 10099 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10100 method is a string parameter that specifies the format of the information field to be used in 10101 the transcripts JSON. It is used to define the format of the information field 10102 :type info_format: str 10103 """ 10104 10105 # Create transcripts table 10106 transcripts_table = self.create_transcript_view() 10107 10108 # Add info field 10109 if transcripts_table: 10110 self.transcript_view_to_variants( 10111 transcripts_table=transcripts_table, 10112 transcripts_info_field_json=info_json, 10113 transcripts_info_field_format=info_format, 10114 ) 10115 else: 10116 log.info("No Transcripts to process. Check param.json file configuration")
The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
field to it if transcripts are available.

Parameters

- info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method is a
  string parameter that represents the information field to be used in the transcripts JSON. It
  is used to specify the JSON format for the transcripts information.
- info_format: The `info_format` parameter in the `calculation_transcripts_annotation` method is
  a string parameter that specifies the format of the information field to be used in the
  transcripts JSON. It is used to define the format of the information field.
10118 def calculation_transcripts_prioritization(self) -> None: 10119 """ 10120 The function `calculation_transcripts_prioritization` creates a transcripts table and 10121 prioritizes transcripts based on certain criteria. 10122 """ 10123 10124 # Create transcripts table 10125 transcripts_table = self.create_transcript_view() 10126 10127 # Add info field 10128 if transcripts_table: 10129 self.transcripts_prioritization(transcripts_table=transcripts_table) 10130 else: 10131 log.info("No Transcripts to process. Check param.json file configuration")
The `calculation_transcripts_prioritization` function creates a transcripts table and
prioritizes transcripts based on certain criteria.
10133 def calculation_transcripts_export(self) -> None: 10134 """ """ 10135 10136 # Create transcripts table 10137 transcripts_table = self.create_transcript_view() 10138 10139 # Add info field 10140 if transcripts_table: 10141 self.transcripts_export(transcripts_table=transcripts_table) 10142 else: 10143 log.info("No Transcripts to process. Check param.json file configuration")
10149 def transcripts_export( 10150 self, transcripts_table: str = None, param: dict = {} 10151 ) -> bool: 10152 """ """ 10153 10154 log.debug("Start transcripts export...") 10155 10156 # Param 10157 if not param: 10158 param = self.get_param() 10159 10160 # Param export 10161 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10162 10163 # Output file 10164 transcripts_export_output = param_transcript_export.get("output", None) 10165 10166 if not param_transcript_export or not transcripts_export_output: 10167 log.warning(f"No transcriipts export parameters defined!") 10168 return False 10169 10170 # List of transcripts annotations 10171 query_describe = f""" 10172 SELECT column_name 10173 FROM ( 10174 DESCRIBE SELECT * FROM {transcripts_table} 10175 ) 10176 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10177 """ 10178 transcripts_annotations_list = list( 10179 self.get_query_to_df(query=query_describe)["column_name"] 10180 ) 10181 10182 # Create transcripts table for export 10183 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10184 random.choices(string.ascii_uppercase + string.digits, k=10) 10185 ) 10186 query_create_transcripts_table_export = f""" 10187 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10188 """ 10189 self.execute_query(query=query_create_transcripts_table_export) 10190 10191 # Output file format 10192 transcripts_export_output_format = get_file_format( 10193 filename=transcripts_export_output 10194 ) 10195 10196 # Format VCF - construct INFO 10197 if transcripts_export_output_format in ["vcf"]: 10198 10199 # Construct query update INFO and header 10200 query_update_info = [] 10201 for field in transcripts_annotations_list: 10202 10203 # If field not in header 10204 if field not in self.get_header_infos_list(): 10205 10206 # Add PZ Transcript in header 10207 
self.get_header().infos[field] = vcf.parser._Info( 10208 field, 10209 ".", 10210 "String", 10211 f"Annotation '{field}' from transcript view", 10212 "unknown", 10213 "unknown", 10214 0, 10215 ) 10216 10217 # Add field as INFO/tag 10218 query_update_info.append( 10219 f""" 10220 CASE 10221 WHEN "{field}" IS NOT NULL 10222 THEN concat('{field}=', "{field}", ';') 10223 ELSE '' 10224 END 10225 """ 10226 ) 10227 10228 # Query param 10229 query_update_info_value = ( 10230 f""" concat('', {", ".join(query_update_info)}) """ 10231 ) 10232 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """ 10233 10234 else: 10235 10236 # Query param 10237 query_update_info_value = f""" NULL """ 10238 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10239 10240 # Update query INFO column 10241 query_update = f""" 10242 UPDATE {transcripts_table_export} 10243 SET INFO = {query_update_info_value} 10244 10245 """ 10246 self.execute_query(query=query_update) 10247 10248 # Export 10249 self.export_output( 10250 output_file=transcripts_export_output, 10251 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10252 ) 10253 10254 # Drop transcripts export table 10255 query_drop_transcripts_table_export = f""" 10256 DROP TABLE {transcripts_table_export} 10257 """ 10258 self.execute_query(query=query_drop_transcripts_table_export)
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        Prioritize transcripts based on configured parameters and update the
        variants table with the prioritized information.

        Ranks transcripts per variant (window function partitioned by
        #CHROM/POS/REF/ALT, ordered by the configured PZ fields and optional
        transcript-preference file) and writes the top-ranked transcript plus
        its PZ annotations into the variants INFO field.

        :param transcripts_table: Name of the table containing transcripts data;
            when None, the view is (re)created via `create_transcript_view`
        :type transcripts_table: str
        :param param: Configuration for the prioritization (pzprefix, profiles,
            pzfields, transcript preference options); falls back to `get_param()`
        :type param: dict
        :return: True when prioritization completed, False when no profile is
            defined or prioritization was not processed
        :raises ValueError: when no transcripts table is available, or a field
            to explode is missing from both the header and the transcripts table
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table: create the view if none was provided
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            if transcripts_table is None:
                # NOTE(review): "availalble" typo kept as-is (runtime message)
                msg_err = "No Transcripts table availalble"
                log.error(msg_err)
                raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO if not exists (the update below appends to it)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields: maps source field -> prefixed output tag
        pz_param_pzfields = {}

        # PZ field transcripts (tag holding the selected transcript id)
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript in header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory fields if asked in param
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param: split into mandatory (prioritization-owned) and
        # extra annotation fields (which also get a header entry)
        pz_param_mandatory_fields = []
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
                pz_param_mandatory_fields.append(
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Add PZ annotation field in header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # PZ fields param: restrict to the mandatory fields found in pzfields
        pz_mandatory_fields = pz_param_mandatory_fields
        pz_param["pzfields"] = pz_mandatory_fields

        # Prioritization (run on the transcripts table, not on variants)
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query fragments
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by: configured ranking order, defaulting to Flag/Score DESC
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode (needed as real columns for the ranking query)
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode: must exist either in the header or the table
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced (preference order wins over PZ order)
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced (match transcript ids with their version)
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update (rn = 1 is the selected transcript)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update (rn = 1 is the selected transcript)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True
The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.

Parameters

- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
  the table containing transcripts data. If no value is provided, it defaults to "transcripts".
  This parameter is used to identify the table where the transcripts data is stored for the
  prioritization process.
- param: The `param` parameter in the `transcripts_prioritization` method is a dictionary that
  contains various configuration settings for the prioritization process of transcripts. It is
  used to customize the behavior of the prioritization algorithm and includes settings such as
  the prefix for prioritization fields, default profiles, and others.

Returns

The function `transcripts_prioritization` returns a boolean value `True` if the transcripts
prioritization process is successfully completed, and `False` if there are any issues or if no
profile is defined for transcripts prioritization.
10576 def create_transcript_view_from_columns_map( 10577 self, 10578 transcripts_table: str = "transcripts", 10579 columns_maps: dict = {}, 10580 added_columns: list = [], 10581 temporary_tables: list = None, 10582 annotation_fields: list = None, 10583 column_rename: dict = {}, 10584 column_clean: bool = False, 10585 column_case: str = None, 10586 ) -> tuple[list, list, list]: 10587 """ 10588 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10589 specified columns mapping for transcripts data. 10590 10591 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10592 of the table where the transcripts data is stored or will be stored in the database. This table 10593 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10594 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10595 :type transcripts_table: str (optional) 10596 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10597 about how to map columns from a transcripts table to create a view. Each entry in the 10598 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10599 typically includes details such as the main transcript column and additional information columns 10600 :type columns_maps: dict 10601 :param added_columns: The `added_columns` parameter in the 10602 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10603 that will be added to the view being created based on the columns map provided. 
These columns 10604 are generated by exploding the transcript information columns along with the main transcript 10605 column 10606 :type added_columns: list 10607 :param temporary_tables: The `temporary_tables` parameter in the 10608 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10609 tables created during the process of creating a transcript view from a columns map. These 10610 temporary tables are used to store intermediate results or transformations before the final view 10611 is generated 10612 :type temporary_tables: list 10613 :param annotation_fields: The `annotation_fields` parameter in the 10614 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10615 used for annotation in the query view creation process. These fields are extracted from the 10616 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10617 :type annotation_fields: list 10618 :param column_rename: The `column_rename` parameter in the 10619 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10620 custom renaming for columns during the creation of the temporary table view. This parameter 10621 provides a mapping of original column names to the desired renamed column names. By using this 10622 parameter, 10623 :type column_rename: dict 10624 :param column_clean: The `column_clean` parameter in the 10625 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10626 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10627 removing any non-alphanumeric characters from them. 
This cleaning process ensures, defaults to 10628 False 10629 :type column_clean: bool (optional) 10630 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10631 function is used to specify the case transformation to be applied to the columns during the view 10632 creation process. It allows you to control whether the column values should be converted to 10633 lowercase, uppercase, or remain unchanged 10634 :type column_case: str 10635 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10636 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 10637 """ 10638 10639 log.debug("Start transcrpts view creation from columns map...") 10640 10641 # "from_columns_map": [ 10642 # { 10643 # "transcripts_column": "Ensembl_transcriptid", 10644 # "transcripts_infos_columns": [ 10645 # "genename", 10646 # "Ensembl_geneid", 10647 # "LIST_S2_score", 10648 # "LIST_S2_pred", 10649 # ], 10650 # }, 10651 # { 10652 # "transcripts_column": "Ensembl_transcriptid", 10653 # "transcripts_infos_columns": [ 10654 # "genename", 10655 # "VARITY_R_score", 10656 # "Aloft_pred", 10657 # ], 10658 # }, 10659 # ], 10660 10661 # Init 10662 if temporary_tables is None: 10663 temporary_tables = [] 10664 if annotation_fields is None: 10665 annotation_fields = [] 10666 10667 # Variants table 10668 table_variants = self.get_table_variants() 10669 10670 for columns_map in columns_maps: 10671 10672 # Log 10673 log.debug(f"columns_map={columns_map}") 10674 10675 # Transcript column 10676 transcripts_column = columns_map.get("transcripts_column", None) 10677 10678 # Transcripts infos columns 10679 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10680 10681 # Transcripts infos columns rename 10682 column_rename = columns_map.get("column_rename", column_rename) 10683 10684 # Transcripts infos columns clean 10685 column_clean = columns_map.get("column_clean", column_clean) 10686 
10687 # Transcripts infos columns case 10688 column_case = columns_map.get("column_case", column_case) 10689 10690 if transcripts_column is not None: 10691 10692 # Explode 10693 added_columns += self.explode_infos( 10694 fields=[transcripts_column] + transcripts_infos_columns 10695 ) 10696 10697 # View clauses 10698 clause_select_variants = [] 10699 clause_select_tanscripts = [] 10700 for field in [transcripts_column] + transcripts_infos_columns: 10701 10702 # AS field 10703 as_field = field 10704 10705 # Rename 10706 if column_rename: 10707 as_field = column_rename.get(as_field, as_field) 10708 10709 # Clean 10710 if column_clean: 10711 as_field = clean_annotation_field(as_field) 10712 10713 # Case 10714 if column_case: 10715 if column_case.lower() in ["lower"]: 10716 as_field = as_field.lower() 10717 elif column_case.lower() in ["upper"]: 10718 as_field = as_field.upper() 10719 10720 # Clause select Variants 10721 clause_select_variants.append( 10722 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10723 ) 10724 10725 if field in [transcripts_column]: 10726 clause_select_tanscripts.append( 10727 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10728 ) 10729 else: 10730 clause_select_tanscripts.append( 10731 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10732 ) 10733 annotation_fields.append(as_field) 10734 10735 # Query View 10736 query = f""" 10737 SELECT 10738 "#CHROM", POS, REF, ALT, INFO, 10739 "{transcripts_column}" AS 'transcript', 10740 {", ".join(clause_select_tanscripts)} 10741 FROM ( 10742 SELECT 10743 "#CHROM", POS, REF, ALT, INFO, 10744 {", ".join(clause_select_variants)} 10745 FROM {table_variants} 10746 ) 10747 WHERE "{transcripts_column}" IS NOT NULL 10748 """ 10749 10750 # Create temporary table 10751 temporary_table = transcripts_table + "".join( 10752 random.choices(string.ascii_uppercase + string.digits, k=10) 10753 ) 10754 10755 # Temporary view 10756 temporary_tables.append(temporary_table) 10757 
query_view = f""" 10758 CREATE view {temporary_table} 10759 AS ({query}) 10760 """ 10761 self.execute_query(query=query_view) 10762 10763 return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_columns_map function generates a temporary table view based on
specified columns mapping for transcripts data.
Parameters
- transcripts_table: The
transcripts_tableparameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts, defaults to transcripts - columns_maps: The
columns_mapsparameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in thecolumns_mapslist represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns - added_columns: The
added_columnsparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column - temporary_tables: The
temporary_tablesparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated - annotation_fields: The
annotation_fieldsparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from thetranscripts_columnandtranscripts_infos_columnsspecified in the `columns - column_rename: The
column_renameparameter in thecreate_transcript_view_from_columns_mapfunction is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view. This parameter provides a mapping of original column names to the desired renamed column names. By using this parameter, - column_clean: The
column_cleanparameter in thecreate_transcript_view_from_columns_mapfunction is a boolean flag that determines whether the column values should be cleaned or not. If set toTrue, the column values will be cleaned by removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to False - column_case: The
column_caseparameter in thecreate_transcript_view_from_columns_mapfunction is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged
Returns
The
`create_transcript_view_from_columns_map` function returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10765 def create_transcript_view_from_column_format( 10766 self, 10767 transcripts_table: str = "transcripts", 10768 column_formats: dict = {}, 10769 temporary_tables: list = None, 10770 annotation_fields: list = None, 10771 column_rename: dict = {}, 10772 column_clean: bool = False, 10773 column_case: str = None, 10774 ) -> tuple[list, list, list]: 10775 """ 10776 The `create_transcript_view_from_column_format` function generates a transcript view based on 10777 specified column formats, adds additional columns and annotation fields, and returns the list of 10778 temporary tables and annotation fields. 10779 10780 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10781 of the table containing the transcripts data. This table will be used as the base table for 10782 creating the transcript view. The default value for this parameter is "transcripts", but you can 10783 provide a different table name if needed, defaults to transcripts 10784 :type transcripts_table: str (optional) 10785 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10786 about the columns to be used for creating the transcript view. Each entry in the dictionary 10787 specifies the mapping between a transcripts column and a transcripts infos column. This 10788 parameter allows you to define how the columns from the transcripts table should be transformed 10789 or mapped 10790 :type column_formats: dict 10791 :param temporary_tables: The `temporary_tables` parameter in the 10792 `create_transcript_view_from_column_format` function is a list that stores the names of 10793 temporary views created during the process of creating a transcript view from a column format. 
10794 These temporary views are used to manipulate and extract data before generating the final 10795 transcript view 10796 :type temporary_tables: list 10797 :param annotation_fields: The `annotation_fields` parameter in the 10798 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10799 that are extracted from the temporary views created during the process. These annotation fields 10800 are obtained by querying the temporary views and extracting the column names excluding specific 10801 columns like `#CH 10802 :type annotation_fields: list 10803 :param column_rename: The `column_rename` parameter in the 10804 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10805 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10806 column names to new column names in this dictionary, you can rename specific columns during the 10807 process 10808 :type column_rename: dict 10809 :param column_clean: The `column_clean` parameter in the 10810 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10811 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10812 will be cleaned during the creation of the transcript view based on the specified column format, 10813 defaults to False 10814 :type column_clean: bool (optional) 10815 :param column_case: The `column_case` parameter in the 10816 `create_transcript_view_from_column_format` function is used to specify the case transformation 10817 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10818 to convert the column names to uppercase or lowercase, respectively 10819 :type column_case: str 10820 :return: The `create_transcript_view_from_column_format` function returns two lists: 10821 `temporary_tables` and `annotation_fields`. 
10822 """ 10823 10824 log.debug("Start transcrpts view creation from column format...") 10825 10826 # "from_column_format": [ 10827 # { 10828 # "transcripts_column": "ANN", 10829 # "transcripts_infos_column": "Feature_ID", 10830 # } 10831 # ], 10832 10833 # Init 10834 if temporary_tables is None: 10835 temporary_tables = [] 10836 if annotation_fields is None: 10837 annotation_fields = [] 10838 10839 added_columns = [] 10840 10841 for column_format in column_formats: 10842 10843 # annotation field and transcript annotation field 10844 annotation_field = column_format.get("transcripts_column", "ANN") 10845 transcript_annotation = column_format.get( 10846 "transcripts_infos_column", "Feature_ID" 10847 ) 10848 10849 # Transcripts infos columns rename 10850 column_rename = column_format.get("column_rename", column_rename) 10851 10852 # Transcripts infos columns clean 10853 column_clean = column_format.get("column_clean", column_clean) 10854 10855 # Transcripts infos columns case 10856 column_case = column_format.get("column_case", column_case) 10857 10858 # Temporary View name 10859 temporary_view_name = transcripts_table + "".join( 10860 random.choices(string.ascii_uppercase + string.digits, k=10) 10861 ) 10862 10863 # Create temporary view name 10864 temporary_view_name, added_columns = self.annotation_format_to_table( 10865 annotation_field=annotation_field, 10866 view_name=temporary_view_name, 10867 annotation_id=transcript_annotation, 10868 column_rename=column_rename, 10869 column_clean=column_clean, 10870 column_case=column_case, 10871 ) 10872 10873 # Annotation fields 10874 if temporary_view_name: 10875 query_annotation_fields = f""" 10876 SELECT * 10877 FROM ( 10878 DESCRIBE SELECT * 10879 FROM {temporary_view_name} 10880 ) 10881 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10882 """ 10883 df_annotation_fields = self.get_query_to_df( 10884 query=query_annotation_fields 10885 ) 10886 10887 # Add temporary view and annotation fields 10888 
temporary_tables.append(temporary_view_name) 10889 annotation_fields += list(set(df_annotation_fields["column_name"])) 10890 10891 return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_column_format function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.
Parameters
- transcripts_table: The
transcripts_tableparameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts - column_formats: The
column_formatsparameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped - temporary_tables: The
temporary_tablesparameter in thecreate_transcript_view_from_column_formatfunction is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view - annotation_fields: The
annotation_fieldsparameter in thecreate_transcript_view_from_column_formatfunction is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names excluding specific columns like `#CH - column_rename: The
column_renameparameter in thecreate_transcript_view_from_column_formatfunction is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process - column_clean: The
column_cleanparameter in thecreate_transcript_view_from_column_formatfunction is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set toTrue, the columns will be cleaned during the creation of the transcript view based on the specified column format, defaults to False - column_case: The
column_caseparameter in thecreate_transcript_view_from_column_formatfunction is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively
Returns
The
`create_transcript_view_from_column_format` function returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10893 def create_transcript_view( 10894 self, 10895 transcripts_table: str = None, 10896 transcripts_table_drop: bool = False, 10897 param: dict = {}, 10898 ) -> str: 10899 """ 10900 The `create_transcript_view` function generates a transcript view by processing data from a 10901 specified table based on provided parameters and structural information. 10902 10903 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 10904 is used to specify the name of the table that will store the final transcript view data. If a table 10905 name is not provided, the function will create a new table to store the transcript view data, and by 10906 default,, defaults to transcripts 10907 :type transcripts_table: str (optional) 10908 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 10909 `create_transcript_view` function is a boolean parameter that determines whether to drop the 10910 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 10911 the function will drop the existing transcripts table if it exists, defaults to False 10912 :type transcripts_table_drop: bool (optional) 10913 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 10914 contains information needed to create a transcript view. It includes details such as the structure 10915 of the transcripts, columns mapping, column formats, and other necessary information for generating 10916 the view. This parameter allows for flexibility and customization 10917 :type param: dict 10918 :return: The `create_transcript_view` function returns the name of the transcripts table that was 10919 created or modified during the execution of the function. 
10920 """ 10921 10922 log.debug("Start transcripts view creation...") 10923 10924 # Default 10925 transcripts_table_default = "transcripts" 10926 10927 # Param 10928 if not param: 10929 param = self.get_param() 10930 10931 # Struct 10932 struct = param.get("transcripts", {}).get("struct", None) 10933 10934 # Transcript veresion 10935 transcript_id_remove_version = param.get("transcripts", {}).get( 10936 "transcript_id_remove_version", False 10937 ) 10938 10939 # Transcripts mapping 10940 transcript_id_mapping_file = param.get("transcripts", {}).get( 10941 "transcript_id_mapping_file", None 10942 ) 10943 10944 # Transcripts mapping 10945 transcript_id_mapping_force = param.get("transcripts", {}).get( 10946 "transcript_id_mapping_force", None 10947 ) 10948 10949 # Transcripts table 10950 if transcripts_table is None: 10951 transcripts_table = param.get("transcripts", {}).get( 10952 "table", transcripts_table_default 10953 ) 10954 10955 # Check transcripts table exists 10956 if transcripts_table: 10957 10958 # Query to check if transcripts table exists 10959 query_check_table = f""" 10960 SELECT * 10961 FROM information_schema.tables 10962 WHERE table_name = '{transcripts_table}' 10963 """ 10964 df_check_table = self.get_query_to_df(query=query_check_table) 10965 10966 # Check if transcripts table exists 10967 if len(df_check_table) > 0 and not transcripts_table_drop: 10968 log.debug(f"Table {transcripts_table} exists and not drop option") 10969 return transcripts_table 10970 10971 if struct: 10972 10973 # added_columns 10974 added_columns = [] 10975 10976 # Temporary tables 10977 temporary_tables = [] 10978 10979 # Annotation fields 10980 annotation_fields = [] 10981 10982 # from columns map 10983 columns_maps = struct.get("from_columns_map", []) 10984 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10985 self.create_transcript_view_from_columns_map( 10986 transcripts_table=transcripts_table, 10987 columns_maps=columns_maps, 10988 
added_columns=added_columns, 10989 temporary_tables=temporary_tables, 10990 annotation_fields=annotation_fields, 10991 ) 10992 ) 10993 added_columns += added_columns_tmp 10994 temporary_tables += temporary_tables_tmp 10995 annotation_fields += annotation_fields_tmp 10996 10997 # from column format 10998 column_formats = struct.get("from_column_format", []) 10999 added_columns, temporary_tables_tmp, annotation_fields_tmp = ( 11000 self.create_transcript_view_from_column_format( 11001 transcripts_table=transcripts_table, 11002 column_formats=column_formats, 11003 temporary_tables=temporary_tables, 11004 annotation_fields=annotation_fields, 11005 ) 11006 ) 11007 added_columns += added_columns_tmp 11008 temporary_tables += temporary_tables_tmp 11009 annotation_fields += annotation_fields_tmp 11010 11011 # Remove some specific fields/column 11012 annotation_fields = list(set(annotation_fields)) 11013 for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]: 11014 if field in annotation_fields: 11015 annotation_fields.remove(field) 11016 11017 # Merge temporary tables query 11018 query_merge = "" 11019 for temporary_table in list(set(temporary_tables)): 11020 11021 # First temporary table 11022 if not query_merge: 11023 query_merge = f""" 11024 SELECT * FROM {temporary_table} 11025 """ 11026 # other temporary table (using UNION) 11027 else: 11028 query_merge += f""" 11029 UNION BY NAME SELECT * FROM {temporary_table} 11030 """ 11031 11032 # transcript table tmp 11033 transcript_table_tmp = "transcripts_tmp" 11034 transcript_table_tmp2 = "transcripts_tmp2" 11035 transcript_table_tmp3 = "transcripts_tmp3" 11036 11037 # Merge on transcript 11038 query_merge_on_transcripts_annotation_fields = [] 11039 11040 # Add transcript list 11041 query_merge_on_transcripts_annotation_fields.append( 11042 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """ 11043 ) 11044 11045 # Aggregate all annotations 
fields 11046 for annotation_field in set(annotation_fields): 11047 query_merge_on_transcripts_annotation_fields.append( 11048 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """ 11049 ) 11050 11051 # Transcripts mapping 11052 if transcript_id_mapping_file: 11053 11054 # Transcript dataframe 11055 transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe" 11056 transcript_id_mapping_dataframe = transcripts_file_to_df( 11057 transcript_id_mapping_file, column_names=["transcript", "alias"] 11058 ) 11059 11060 # Transcript version remove 11061 if transcript_id_remove_version: 11062 query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped" 11063 query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)" 11064 query_left_join = f""" 11065 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11066 """ 11067 else: 11068 query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped" 11069 query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript" 11070 query_left_join = f""" 11071 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11072 """ 11073 11074 # Transcript column for group by merge 11075 query_transcript_merge_group_by = """ 11076 CASE 11077 WHEN transcript_mapped NOT IN ('') 11078 THEN split_part(transcript_mapped, '.', 1) 
11079 ELSE split_part(transcript_original, '.', 1) 11080 END 11081 """ 11082 11083 # Merge query 11084 transcripts_tmp2_query = f""" 11085 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)} 11086 FROM ({query_merge}) AS {transcript_table_tmp} 11087 {query_left_join} 11088 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by} 11089 """ 11090 11091 # Retrive columns after mege 11092 transcripts_tmp2_describe_query = f""" 11093 DESCRIBE {transcripts_tmp2_query} 11094 """ 11095 transcripts_tmp2_describe_list = list( 11096 self.get_query_to_df(query=transcripts_tmp2_describe_query)[ 11097 "column_name" 11098 ] 11099 ) 11100 11101 # Create list of columns for select clause 11102 transcripts_tmp2_describe_select_clause = [] 11103 for field in transcripts_tmp2_describe_list: 11104 if field not in [ 11105 "#CHROM", 11106 "POS", 11107 "REF", 11108 "ALT", 11109 "INFO", 11110 "transcript_mapped", 11111 ]: 11112 as_field = field 11113 if field in ["transcript_original"]: 11114 as_field = "transcripts_mapped" 11115 transcripts_tmp2_describe_select_clause.append( 11116 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """ 11117 ) 11118 11119 # Merge with mapping 11120 query_merge_on_transcripts = f""" 11121 SELECT 11122 "#CHROM", POS, REF, ALT, INFO, 11123 CASE 11124 WHEN ANY_VALUE(transcript_mapped) NOT IN ('') 11125 THEN ANY_VALUE(transcript_mapped) 11126 ELSE ANY_VALUE(transcript_original) 11127 END AS transcript, 11128 {", ".join(transcripts_tmp2_describe_select_clause)} 11129 FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2} 11130 GROUP BY "#CHROM", POS, REF, ALT, INFO, 11131 {query_transcript_merge_group_by} 11132 """ 11133 11134 # Add transcript filter from mapping file 11135 if transcript_id_mapping_force: 11136 query_merge_on_transcripts = f""" 11137 SELECT * 11138 FROM ({query_merge_on_transcripts}) AS 
{transcript_table_tmp3} 11139 WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe) 11140 """ 11141 11142 # No transcript mapping 11143 else: 11144 11145 # Remove transcript version 11146 if transcript_id_remove_version: 11147 query_transcript_column = f""" 11148 split_part({transcript_table_tmp}.transcript, '.', 1) 11149 """ 11150 else: 11151 query_transcript_column = """ 11152 transcript 11153 """ 11154 11155 # Query sections 11156 query_transcript_column_select = ( 11157 f"{query_transcript_column} AS transcript" 11158 ) 11159 query_transcript_column_group_by = query_transcript_column 11160 11161 # Query for transcripts view 11162 query_merge_on_transcripts = f""" 11163 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)} 11164 FROM ({query_merge}) AS {transcript_table_tmp} 11165 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} 11166 """ 11167 11168 # Drop transcript view is necessary 11169 if transcripts_table_drop: 11170 query_drop = f""" 11171 DROP TABLE IF EXISTS {transcripts_table}; 11172 """ 11173 self.execute_query(query=query_drop) 11174 11175 # List of unique #CHROM 11176 query_unique_chrom = f""" 11177 SELECT DISTINCT "#CHROM" 11178 FROM variants AS subquery 11179 """ 11180 unique_chroms = self.get_query_to_df(query=query_unique_chrom) 11181 11182 # Create table with structure but without data, if not exists 11183 query_create_table = f""" 11184 CREATE TABLE IF NOT EXISTS {transcripts_table} AS 11185 SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0 11186 """ 11187 self.execute_query(query=query_create_table) 11188 11189 # Process by #CHROM 11190 for chrom in unique_chroms["#CHROM"]: 11191 11192 # Log 11193 log.debug(f"Processing #CHROM={chrom}") 11194 11195 # Select data by #CHROM 11196 query_chunk = f""" 11197 SELECT * 11198 
FROM ({query_merge_on_transcripts}) 11199 WHERE "#CHROM" = '{chrom}' 11200 """ 11201 11202 # Insert data 11203 query_insert_chunk = f""" 11204 INSERT INTO {transcripts_table} 11205 {query_chunk} 11206 """ 11207 self.execute_query(query=query_insert_chunk) 11208 11209 # Remove temporary tables 11210 if temporary_tables: 11211 for temporary_table in list(set(temporary_tables)): 11212 try: 11213 query_drop_tmp_table = f""" 11214 DROP TABLE IF EXISTS {temporary_table} 11215 """ 11216 self.execute_query(query=query_drop_tmp_table) 11217 except Exception as e: 11218 log.debug(f"'{temporary_table}' Not a table") 11219 try: 11220 query_drop_tmp_table = f""" 11221 DROP VIEW IF EXISTS {temporary_table} 11222 """ 11223 self.execute_query(query=query_drop_tmp_table) 11224 except Exception as e: 11225 log.debug(f"'{temporary_table}' Not a view") 11226 11227 # Remove added columns 11228 for added_column in added_columns: 11229 self.drop_column(column=added_column) 11230 11231 else: 11232 11233 transcripts_table = None 11234 11235 return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters
- transcripts_table: The
transcripts_tableparameter in thecreate_transcript_viewfunction is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data, and by default,, defaults to transcripts - transcripts_table_drop: The
transcripts_table_dropparameter in thecreate_transcript_viewfunction is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. Iftranscripts_table_dropis set toTrue, the function will drop the existing transcripts table if it exists, defaults to False - param: The
paramparameter in thecreate_transcript_viewfunction is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns
The
`create_transcript_view` function returns the name of the transcripts table that was created or modified during the execution of the function.
11237 def annotation_format_to_table( 11238 self, 11239 annotation_field: str = "ANN", 11240 annotation_id: str = "Feature_ID", 11241 view_name: str = "transcripts", 11242 column_rename: dict = {}, 11243 column_clean: bool = False, 11244 column_case: str = None, 11245 ) -> str: 11246 """ 11247 The `annotation_format_to_table` function converts annotation data from a VCF file into a 11248 structured table format, ensuring unique values and creating a temporary table for further 11249 processing or analysis. 11250 11251 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure 11252 unique values in the output or not. If set to `True`, the function will make sure that the 11253 output values are unique, defaults to True 11254 :type uniquify: bool (optional) 11255 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file 11256 that contains the annotation information for each variant. This field is used to extract the 11257 annotation details for further processing in the function. By default, it is set to "ANN", 11258 defaults to ANN 11259 :type annotation_field: str (optional) 11260 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method 11261 is used to specify the identifier for the annotation feature. This identifier will be used as a 11262 column name in the resulting table or view that is created based on the annotation data. It 11263 helps in uniquely identifying each annotation entry in the, defaults to Feature_ID 11264 :type annotation_id: str (optional) 11265 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used 11266 to specify the name of the temporary table that will be created to store the transformed 11267 annotation data. This table will hold the extracted information from the annotation field in a 11268 structured format for further processing or analysis. 
By default,, defaults to transcripts 11269 :type view_name: str (optional) 11270 :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method 11271 is a dictionary that allows you to specify custom renaming for columns. By providing key-value 11272 pairs in this dictionary, you can rename specific columns in the resulting table or view that is 11273 created based on the annotation data. This feature enables 11274 :type column_rename: dict 11275 :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is 11276 a boolean flag that determines whether the annotation field should undergo a cleaning process. 11277 If set to `True`, the function will clean the annotation field before further processing. This 11278 cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults 11279 to False 11280 :type column_clean: bool (optional) 11281 :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is 11282 used to specify the case transformation to be applied to the column names extracted from the 11283 annotation data. It allows you to set the case of the column names to either lowercase or 11284 uppercase for consistency or other specific requirements during the conversion 11285 :type column_case: str 11286 :return: The function `annotation_format_to_table` is returning the name of the view created, 11287 which is stored in the variable `view_name`. 
11288 """ 11289 11290 # Transcript annotation 11291 if column_rename: 11292 annotation_id = column_rename.get(annotation_id, annotation_id) 11293 11294 if column_clean: 11295 annotation_id = clean_annotation_field(annotation_id) 11296 11297 # Prefix 11298 prefix = self.get_explode_infos_prefix() 11299 if prefix: 11300 prefix = "INFO/" 11301 11302 # Variants table 11303 table_variants = self.get_table_variants() 11304 11305 # Header 11306 vcf_reader = self.get_header() 11307 11308 # Add columns 11309 added_columns = [] 11310 11311 # Explode HGVS field in column 11312 added_columns += self.explode_infos(fields=[annotation_field]) 11313 11314 if annotation_field in vcf_reader.infos: 11315 11316 # Extract ANN header 11317 ann_description = vcf_reader.infos[annotation_field].desc 11318 pattern = r"'(.+?)'" 11319 match = re.search(pattern, ann_description) 11320 if match: 11321 ann_header_match = match.group(1).split(" | ") 11322 ann_header = [] 11323 ann_header_desc = {} 11324 for i in range(len(ann_header_match)): 11325 ann_header_info = "".join( 11326 char for char in ann_header_match[i] if char.isalnum() 11327 ) 11328 ann_header.append(ann_header_info) 11329 ann_header_desc[ann_header_info] = ann_header_match[i] 11330 if not ann_header_desc: 11331 raise ValueError("Invalid header description format") 11332 else: 11333 raise ValueError("Invalid header description format") 11334 11335 # Create dataframe for keys column type 11336 dataframe_annotation_format = self.get_query_to_df( 11337 f""" 11338 WITH exploded_annotations AS ( 11339 SELECT 11340 UNNEST(STRING_SPLIT(ANN, ',')) AS annotation 11341 FROM {table_variants} 11342 ), 11343 split_annotations AS ( 11344 SELECT 11345 {", ".join([f"SPLIT_PART(annotation, '|', {i+1}) AS '{header}'" for i, header in enumerate(ann_header_desc.values())])}, 11346 FROM exploded_annotations 11347 ) 11348 SELECT * FROM split_annotations 11349 LIMIT 1000 11350 """ 11351 ) 11352 11353 # Init 11354 query_list_keys = [] 11355 key_i = 0 
11356 11357 for key in dataframe_annotation_format.keys(): 11358 11359 # Key 11360 key_i += 1 11361 key_clean = key 11362 11363 # key rename 11364 if column_rename: 11365 key_clean = column_rename.get(key_clean, key_clean) 11366 11367 # key clean 11368 if column_clean: 11369 key_clean = clean_annotation_field(key_clean) 11370 11371 # Key case 11372 if column_case: 11373 if column_case.lower() in ["lower"]: 11374 key_clean = key_clean.lower() 11375 elif column_case.lower() in ["upper"]: 11376 key_clean = key_clean.upper() 11377 11378 # Detect column type 11379 column_type = detect_column_type(dataframe_annotation_format[key]) 11380 11381 # Append key to list 11382 query_list_keys.append( 11383 f""" NULLIF(SPLIT_PART(annotation, '|', {key_i}), '')::{column_type} AS '{prefix}{key_clean}' """ 11384 ) 11385 11386 # Create temporary table 11387 query_create_view = f""" 11388 CREATE VIEW {view_name} AS ( 11389 WITH exploded_annotations AS ( 11390 SELECT 11391 "#CHROM", 11392 POS, 11393 REF, 11394 ALT, 11395 INFO, 11396 UNNEST(STRING_SPLIT(ANN, ',')) AS annotation 11397 FROM {table_variants} 11398 ), 11399 split_annotations AS ( 11400 SELECT 11401 "#CHROM", 11402 POS, 11403 REF, 11404 ALT, 11405 INFO, 11406 {", ".join(query_list_keys)}, 11407 FROM exploded_annotations 11408 ) 11409 SELECT *, {annotation_id} AS 'transcript' FROM split_annotations 11410 ) 11411 """ 11412 log.debug(f"query_create_view: {query_create_view}") 11413 self.execute_query(query=query_create_view) 11414 11415 else: 11416 11417 # Return None 11418 view_name = None 11419 11420 return view_name, added_columns
The annotation_format_to_table function converts annotation data from a VCF file into a
structured table format, ensuring unique values and creating a temporary table for further
processing or analysis.
Parameters
- `uniquify`: a boolean flag that determines whether to ensure unique values in the output. If set to `True`, the function makes sure that the output values are unique. Defaults to `True`.
- `annotation_field`: the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. Defaults to `ANN`.
- `annotation_id`: the identifier for the annotation feature. This identifier is used as a column name in the resulting table or view created from the annotation data, and helps uniquely identify each annotation entry. Defaults to `Feature_ID`.
- `view_name`: the name of the temporary table that is created to store the transformed annotation data. This table holds the information extracted from the annotation field in a structured format for further processing or analysis. Defaults to `transcripts`.
- `column_rename`: a dictionary that allows you to specify custom renaming for columns. Each key-value pair renames a specific column in the resulting table or view created from the annotation data.
- `column_clean`: a boolean flag that determines whether the annotation field names undergo a cleaning process (for example, removing unwanted characters or formatting inconsistencies) before further processing. Defaults to `False`.
- `column_case`: the case transformation applied to the column names extracted from the annotation data; set the column names to either lowercase or uppercase for consistency or other specific requirements.
Returns
The function `annotation_format_to_table` returns the name of the view created, which is stored in the variable `view_name`.
11422 def transcript_view_to_variants( 11423 self, 11424 transcripts_table: str = None, 11425 transcripts_column_id: str = None, 11426 transcripts_info_json: str = None, 11427 transcripts_info_field_json: str = None, 11428 transcripts_info_format: str = None, 11429 transcripts_info_field_format: str = None, 11430 param: dict = {}, 11431 ) -> bool: 11432 """ 11433 The `transcript_view_to_variants` function updates a variants table with information from 11434 transcripts in JSON format. 11435 11436 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11437 table containing the transcripts data. If this parameter is not provided, the function will 11438 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11439 :type transcripts_table: str 11440 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11441 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11442 identifier is used to match transcripts with variants in the database 11443 :type transcripts_column_id: str 11444 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11445 of the column in the variants table where the transcripts information will be stored in JSON 11446 format. This parameter allows you to define the column in the variants table that will hold the 11447 JSON-formatted information about transcripts 11448 :type transcripts_info_json: str 11449 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11450 specify the field in the VCF header that will contain information about transcripts in JSON 11451 format. 
This field will be added to the VCF header as an INFO field with the specified name 11452 :type transcripts_info_field_json: str 11453 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11454 format of the information about transcripts that will be stored in the variants table. This 11455 format can be used to define how the transcript information will be structured or displayed 11456 within the variants table 11457 :type transcripts_info_format: str 11458 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11459 specify the field in the VCF header that will contain information about transcripts in a 11460 specific format. This field will be added to the VCF header as an INFO field with the specified 11461 name 11462 :type transcripts_info_field_format: str 11463 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11464 that contains various configuration settings related to transcripts. It is used to provide 11465 default values for certain parameters if they are not explicitly provided when calling the 11466 method. The `param` dictionary can be passed as an argument 11467 :type param: dict 11468 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11469 if the operation is successful and `False` if certain conditions are not met. 
11470 """ 11471 11472 msg_info_prefix = "Start transcripts view to variants annotations" 11473 11474 log.debug(f"{msg_info_prefix}...") 11475 11476 # Default 11477 transcripts_table_default = "transcripts" 11478 transcripts_column_id_default = "transcript" 11479 transcripts_info_json_default = None 11480 transcripts_info_format_default = None 11481 transcripts_info_field_json_default = None 11482 transcripts_info_field_format_default = None 11483 11484 # Param 11485 if not param: 11486 param = self.get_param() 11487 11488 # Transcripts table 11489 if transcripts_table is None: 11490 transcripts_table = param.get("transcripts", {}).get( 11491 "table", transcripts_table_default 11492 ) 11493 11494 # Transcripts column ID 11495 if transcripts_column_id is None: 11496 transcripts_column_id = param.get("transcripts", {}).get( 11497 "column_id", transcripts_column_id_default 11498 ) 11499 11500 # Transcripts info json 11501 if transcripts_info_json is None: 11502 transcripts_info_json = param.get("transcripts", {}).get( 11503 "transcripts_info_json", transcripts_info_json_default 11504 ) 11505 11506 # Transcripts info field JSON 11507 if transcripts_info_field_json is None: 11508 transcripts_info_field_json = param.get("transcripts", {}).get( 11509 "transcripts_info_field_json", transcripts_info_field_json_default 11510 ) 11511 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11512 # transcripts_info_json = transcripts_info_field_json 11513 11514 # Transcripts info format 11515 if transcripts_info_format is None: 11516 transcripts_info_format = param.get("transcripts", {}).get( 11517 "transcripts_info_format", transcripts_info_format_default 11518 ) 11519 11520 # Transcripts info field FORMAT 11521 if transcripts_info_field_format is None: 11522 transcripts_info_field_format = param.get("transcripts", {}).get( 11523 "transcripts_info_field_format", transcripts_info_field_format_default 11524 ) 11525 # if ( 11526 # 
transcripts_info_field_format is not None 11527 # and transcripts_info_format is None 11528 # ): 11529 # transcripts_info_format = transcripts_info_field_format 11530 11531 # Variants table 11532 table_variants = self.get_table_variants() 11533 11534 # Check info columns param 11535 if ( 11536 transcripts_info_json is None 11537 and transcripts_info_field_json is None 11538 and transcripts_info_format is None 11539 and transcripts_info_field_format is None 11540 ): 11541 return False 11542 11543 # Transcripts infos columns 11544 query_transcripts_infos_columns = f""" 11545 SELECT * 11546 FROM ( 11547 DESCRIBE SELECT * FROM {transcripts_table} 11548 ) 11549 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11550 """ 11551 transcripts_infos_columns = list( 11552 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11553 ) 11554 11555 # View results 11556 clause_select = [] 11557 clause_to_json = [] 11558 clause_to_format = [] 11559 for field in transcripts_infos_columns: 11560 # Do not consider INFO field for export into fields 11561 if field not in ["INFO"]: 11562 clause_select.append( 11563 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11564 ) 11565 clause_to_json.append(f""" '{field}': "{field}" """) 11566 clause_to_format.append(f""" "{field}" """) 11567 11568 # Update 11569 update_set_json = [] 11570 update_set_format = [] 11571 11572 # VCF header 11573 vcf_reader = self.get_header() 11574 11575 # Transcripts to info column in JSON 11576 if transcripts_info_json: 11577 11578 # Create column on variants table 11579 self.add_column( 11580 table_name=table_variants, 11581 column_name=transcripts_info_json, 11582 column_type="JSON", 11583 default_value=None, 11584 drop=False, 11585 ) 11586 11587 # Add header 11588 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11589 transcripts_info_json, 11590 ".", 11591 "String", 11592 "Transcripts in JSON format", 11593 "unknwon", 
11594 "unknwon", 11595 self.code_type_map["String"], 11596 ) 11597 11598 # Add to update 11599 update_set_json.append( 11600 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11601 ) 11602 11603 # Transcripts to info field in JSON 11604 if transcripts_info_field_json: 11605 11606 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11607 11608 # Add to update 11609 update_set_json.append( 11610 f""" 11611 INFO = concat( 11612 CASE 11613 WHEN INFO NOT IN ('', '.') 11614 THEN INFO 11615 ELSE '' 11616 END, 11617 CASE 11618 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11619 THEN concat( 11620 ';{transcripts_info_field_json}=', 11621 t.{transcripts_info_json} 11622 ) 11623 ELSE '' 11624 END 11625 ) 11626 """ 11627 ) 11628 11629 # Add header 11630 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11631 transcripts_info_field_json, 11632 ".", 11633 "String", 11634 "Transcripts in JSON format", 11635 "unknwon", 11636 "unknwon", 11637 self.code_type_map["String"], 11638 ) 11639 11640 if update_set_json: 11641 11642 # Update query 11643 query_update = f""" 11644 UPDATE {table_variants} 11645 SET {", ".join(update_set_json)} 11646 FROM 11647 ( 11648 SELECT 11649 "#CHROM", POS, REF, ALT, 11650 concat( 11651 '{{', 11652 string_agg( 11653 '"' || "{transcripts_column_id}" || '":' || 11654 to_json(json_output) 11655 ), 11656 '}}' 11657 )::JSON AS {transcripts_info_json} 11658 FROM 11659 ( 11660 SELECT 11661 "#CHROM", POS, REF, ALT, 11662 "{transcripts_column_id}", 11663 to_json( 11664 {{{",".join(clause_to_json)}}} 11665 )::JSON AS json_output 11666 FROM 11667 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11668 WHERE "{transcripts_column_id}" IS NOT NULL 11669 ) 11670 GROUP BY "#CHROM", POS, REF, ALT 11671 ) AS t 11672 WHERE {table_variants}."#CHROM" = t."#CHROM" 11673 AND {table_variants}."POS" = t."POS" 11674 AND {table_variants}."REF" = t."REF" 11675 AND 
{table_variants}."ALT" = t."ALT" 11676 """ 11677 11678 self.execute_query(query=query_update) 11679 11680 # Transcripts to info column in FORMAT 11681 if transcripts_info_format: 11682 11683 # Create column on variants table 11684 self.add_column( 11685 table_name=table_variants, 11686 column_name=transcripts_info_format, 11687 column_type="VARCHAR", 11688 default_value=None, 11689 drop=False, 11690 ) 11691 11692 # Add header 11693 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11694 transcripts_info_format, 11695 ".", 11696 "String", 11697 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11698 "unknwon", 11699 "unknwon", 11700 self.code_type_map["String"], 11701 ) 11702 11703 # Add to update 11704 update_set_format.append( 11705 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11706 ) 11707 11708 else: 11709 11710 # Set variable for internal queries 11711 transcripts_info_format = "transcripts_info_format" 11712 11713 # Transcripts to info field in JSON 11714 if transcripts_info_field_format: 11715 11716 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11717 11718 # Add to update 11719 update_set_format.append( 11720 f""" 11721 INFO = concat( 11722 CASE 11723 WHEN INFO NOT IN ('', '.') 11724 THEN INFO 11725 ELSE '' 11726 END, 11727 CASE 11728 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11729 THEN concat( 11730 ';{transcripts_info_field_format}=', 11731 t.{transcripts_info_format} 11732 ) 11733 ELSE '' 11734 END 11735 ) 11736 """ 11737 ) 11738 11739 # Add header 11740 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11741 transcripts_info_field_format, 11742 ".", 11743 "String", 11744 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11745 "unknwon", 11746 "unknwon", 11747 self.code_type_map["String"], 11748 ) 11749 11750 if update_set_format: 11751 11752 # Update query 11753 query_update = f""" 11754 UPDATE 
{table_variants} 11755 SET {", ".join(update_set_format)} 11756 FROM 11757 ( 11758 SELECT 11759 "#CHROM", POS, REF, ALT, 11760 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11761 FROM 11762 ( 11763 SELECT 11764 "#CHROM", POS, REF, ALT, 11765 "{transcripts_column_id}", 11766 concat( 11767 "{transcripts_column_id}", 11768 '|', 11769 {", '|', ".join(clause_to_format)} 11770 ) AS {transcripts_info_format} 11771 FROM 11772 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11773 ) 11774 GROUP BY "#CHROM", POS, REF, ALT 11775 ) AS t 11776 WHERE {table_variants}."#CHROM" = t."#CHROM" 11777 AND {table_variants}."POS" = t."POS" 11778 AND {table_variants}."REF" = t."REF" 11779 AND {table_variants}."ALT" = t."ALT" 11780 """ 11781 11782 self.execute_query(query=query_update) 11783 11784 return True
The transcript_view_to_variants function updates a variants table with information from
transcripts in JSON format.
Parameters
- `transcripts_table`: the name of the table containing the transcripts data. If this parameter is not provided, the function attempts to retrieve it from the `param` dictionary or uses the default value "transcripts".
- `transcripts_column_id`: the column in `transcripts_table` that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database.
- `transcripts_info_json`: the name of the column in the variants table where the transcripts information will be stored in JSON format.
- `transcripts_info_field_json`: the field added to the VCF header as an INFO field, containing information about transcripts in JSON format.
- `transcripts_info_format`: the format of the transcripts information stored in the variants table, defining how the transcript information is structured or displayed.
- `transcripts_info_field_format`: the field added to the VCF header as an INFO field, containing information about transcripts in a specific format.
- `param`: a dictionary of configuration settings related to transcripts, used to provide default values for parameters that are not explicitly provided when calling the method.
Returns
The function `transcript_view_to_variants` returns a boolean value: `True` if the operation is successful and `False` if certain conditions are not met.
11786 def rename_info_fields( 11787 self, fields_to_rename: dict = None, table: str = None 11788 ) -> dict: 11789 """ 11790 The `rename_info_fields` function renames specified fields in a VCF file header and updates 11791 corresponding INFO fields in the variants table. 11792 11793 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the 11794 mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary 11795 represent the original field names that need to be renamed, and the corresponding values 11796 represent the new names to which the fields should be 11797 :type fields_to_rename: dict 11798 :param table: The `table` parameter in the `rename_info_fields` function represents the name of 11799 the table in which the variants data is stored. This table contains information about genetic 11800 variants, and the function updates the corresponding INFO fields in this table when renaming 11801 specified fields in the VCF file header 11802 :type table: str 11803 :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains 11804 the original field names as keys and their corresponding new names (or None if the field was 11805 removed) as values after renaming or removing specified fields in a VCF file header and updating 11806 corresponding INFO fields in the variants table. 
11807 """ 11808 11809 # Init 11810 fields_renamed = {} 11811 config = self.get_config() 11812 access = config.get("access") 11813 11814 if table is None: 11815 table = self.get_table_variants() 11816 11817 # regexp replace fonction 11818 regex_replace_dict = {} 11819 regex_replace_nb = 0 11820 regex_replace_partition = 125 11821 regex_replace = "concat(INFO, ';')" # Add ';' to reduce regexp comlexity 11822 11823 if fields_to_rename is not None and access not in ["RO"]: 11824 11825 log.info("Rename or remove fields...") 11826 11827 # Header 11828 header = self.get_header() 11829 11830 for field_to_rename, field_renamed in fields_to_rename.items(): 11831 11832 if field_to_rename in header.infos: 11833 11834 # Rename header 11835 if field_renamed is not None: 11836 header.infos[field_renamed] = vcf.parser._Info( 11837 field_renamed, 11838 header.infos[field_to_rename].num, 11839 header.infos[field_to_rename].type, 11840 header.infos[field_to_rename].desc, 11841 header.infos[field_to_rename].source, 11842 header.infos[field_to_rename].version, 11843 header.infos[field_to_rename].type_code, 11844 ) 11845 del header.infos[field_to_rename] 11846 11847 # Rename INFO patterns 11848 field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;" 11849 if field_renamed is not None: 11850 field_renamed_pattern = rf"\1{field_renamed}\3;" 11851 else: 11852 field_renamed_pattern = r"\1" 11853 11854 # regexp replace 11855 regex_replace_nb += 1 11856 regex_replace_key = math.floor( 11857 regex_replace_nb / regex_replace_partition 11858 ) 11859 if (regex_replace_nb % regex_replace_partition) == 0: 11860 regex_replace = "concat(INFO, ';')" 11861 regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')" 11862 regex_replace_dict[regex_replace_key] = regex_replace 11863 11864 # Return 11865 fields_renamed[field_to_rename] = field_renamed 11866 11867 # Log 11868 if field_renamed is not None: 11869 log.info( 11870 f"Rename or remove fields - field 
'{field_to_rename}' renamed to '{field_renamed}'" 11871 ) 11872 else: 11873 log.info( 11874 f"Rename or remove fields - field '{field_to_rename}' removed" 11875 ) 11876 11877 else: 11878 11879 log.warning( 11880 f"Rename or remove fields - field '{field_to_rename}' not in header" 11881 ) 11882 11883 # Rename INFO 11884 for regex_replace_key, regex_replace in regex_replace_dict.items(): 11885 log.info( 11886 f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..." 11887 ) 11888 query = f""" 11889 UPDATE {table} 11890 SET 11891 INFO = regexp_replace({regex_replace}, ';$', '') 11892 """ 11893 log.debug(f"query={query}") 11894 self.execute_query(query=query) 11895 11896 return fields_renamed
The rename_info_fields function renames specified fields in a VCF file header and updates
corresponding INFO fields in the variants table.
Parameters
- `fields_to_rename`: a dictionary containing the mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys represent the original field names that need to be renamed, and the corresponding values represent the new names to which the fields should be renamed.
- `table`: the name of the table in which the variants data is stored. This table contains information about genetic variants, and the function updates the corresponding INFO fields in this table when renaming the specified fields in the VCF file header.
Returns
The `rename_info_fields` function returns a dictionary `fields_renamed` that contains the original field names as keys and their corresponding new names (or `None` if the field was removed) as values, after renaming or removing the specified fields in the VCF file header and updating the corresponding INFO fields in the variants table.
11898 def calculation_rename_info_fields( 11899 self, 11900 fields_to_rename: dict = None, 11901 table: str = None, 11902 operation_name: str = "RENAME_INFO_FIELDS", 11903 ) -> None: 11904 """ 11905 The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates 11906 fields to rename and table if provided, and then calls another function to rename the fields. 11907 11908 :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be 11909 renamed in a table. Each key-value pair in the dictionary represents the original field name as 11910 the key and the new field name as the value 11911 :type fields_to_rename: dict 11912 :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to 11913 specify the name of the table for which the fields are to be renamed. It is a string type 11914 parameter 11915 :type table: str 11916 :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields` 11917 method is a string that specifies the name of the operation being performed. 
In this context, it 11918 is used as a default value for the operation name if not explicitly provided when calling the 11919 function, defaults to RENAME_INFO_FIELDS 11920 :type operation_name: str (optional) 11921 """ 11922 11923 # Param 11924 param = self.get_param() 11925 11926 # Get param fields to rename 11927 param_fields_to_rename = ( 11928 param.get("calculation", {}) 11929 .get("calculations", {}) 11930 .get(operation_name, {}) 11931 .get("fields_to_rename", None) 11932 ) 11933 11934 # Get param table 11935 param_table = ( 11936 param.get("calculation", {}) 11937 .get("calculations", {}) 11938 .get(operation_name, {}) 11939 .get("table", None) 11940 ) 11941 11942 # Init fields_to_rename 11943 if fields_to_rename is None: 11944 fields_to_rename = param_fields_to_rename 11945 11946 # Init table 11947 if table is None: 11948 table = param_table 11949 11950 renamed_fields = self.rename_info_fields( 11951 fields_to_rename=fields_to_rename, table=table 11952 ) 11953 11954 log.debug(f"renamed_fields:{renamed_fields}")
The calculation_rename_info_fields function retrieves parameters from a dictionary, updates
fields to rename and table if provided, and then calls another function to rename the fields.
Parameters
- `fields_to_rename`: a dictionary containing the fields to be renamed in a table. Each key-value pair represents the original field name as the key and the new field name as the value.
- `table`: the name of the table for which the fields are to be renamed. It is a string parameter.
- `operation_name`: a string specifying the name of the operation being performed. It is used as the key under which the operation's parameters are looked up when they are not explicitly provided when calling the function. Defaults to `RENAME_INFO_FIELDS`.
def create_annotations_view(
    self,
    table: str = None,
    view: str = None,
    view_type: str = None,
    fields: list = None,
    prefix: str = "",
    drop_view: bool = False,
    fields_to_rename: dict = None,
    limit: int = None,
) -> str:
    """
    Create a SQL view (or table) exposing VCF INFO fields as typed columns.

    Each row of the view is a variant and each requested field becomes a
    column: fields already present as columns of the source table are
    selected directly; fields declared in the VCF header are extracted from
    the INFO column with a regexp and cast to the SQL type mapped from the
    header type (Flag fields become BOOLEAN, multi-valued fields become
    typed lists); fields found neither in the table nor in the header are
    emitted as NULL with a warning.

    :param table: name of the source table containing the variants
        (default: the variants table of this object)
    :param view: name of the view to create (default: '<table>_annotations')
    :param view_type: 'VIEW' or 'TABLE' (default: 'VIEW')
    :param fields: INFO field names to expose (default: all fields declared
        in the VCF header)
    :param prefix: prefix prepended to each generated column name
    :param drop_view: drop an existing view/table of the same name before
        creating the new one
    :param fields_to_rename: mapping of original field name to renamed
        column name
    :param limit: maximum number of rows included in the view (no limit if
        None)
    :raises ValueError: if view_type is neither 'VIEW' nor 'TABLE'
    :return: the name of the created view
    """

    # Default table: the variants table
    if table is None:
        table = self.get_table_variants()

    # Default view name derived from the table name
    if view is None:
        view = f"{table}_annotations"

    # Default view type
    if view_type is None:
        view_type = "VIEW"

    # Check view type value
    if view_type.upper() not in ["VIEW", "TABLE"]:
        raise ValueError(
            f"Invalid view type value: {view_type}. Either 'VIEW' or 'TABLE'"
        )

    # VCF header (provides field declarations and types)
    header = self.get_header()

    # Default fields: every INFO field declared in the header
    if fields is None:
        fields = list(header.infos.keys())

    # Default renaming: keep original names
    if fields_to_rename is None:
        fields_to_rename = {}

    log.info(
        f"Create '{view}' view (as '{view_type}') from table '{table}' with {len(fields)} fields"
    )

    # Describe table to know which fields already exist as real columns
    table_describe_query = f"""
        DESCRIBE {table}
    """
    table_describe = self.get_query_to_df(query=table_describe_query)

    # Build one SQL column expression per requested field.
    # NOTE(review): table/view/field identifiers are interpolated into SQL
    # unescaped — callers must provide trusted identifiers.
    fields_columns = []
    fields_needed = ["#CHROM", "POS", "REF", "ALT"]
    for field in fields:

        # Output column name (renamed if requested)
        field_to_rename = fields_to_rename.get(field, field)

        # Mandatory fields are always selected separately
        if field in fields_needed:
            continue

        # Fields already present as table columns: select directly
        elif field in list(table_describe.get("column_name")):
            fields_columns.append(f""" "{field}" AS '{prefix}{field_to_rename}' """)

        # Fields declared in the VCF header: extract from the INFO column
        elif field in header.infos:

            # Field declaration from the VCF header
            field_infos = header.infos.get(field, None)

            # SQL type mapped from the VCF header type
            field_sql_type = code_type_map_to_sql.get(field_infos.type, "VARCHAR")

            # Multi-valued field (Number != 1 in the VCF header)?
            # BUGFIX: determined per field — the previous sticky flag was
            # never reset, so every field after the first multi-valued (or
            # Flag, Number=0) field was wrongly treated as a list too.
            field_is_list = field_infos.num != 1

            # Flag field: presence in INFO means True
            if field_infos.type == "Flag":
                field_pattern = rf"(^|;)({field})([^;]*)?"
                fields_columns.append(
                    f""" regexp_matches("INFO", '{field_pattern}')::BOOLEAN AS '{prefix}{field_to_rename}' """
                )

            # Typed field: extract the value after 'FIELD='
            else:

                # Pattern capturing the field value in group 3
                field_pattern = rf"(^|;)({field})=([^;]*)?"

                # Multi-valued field: split on ',' into a typed list,
                # mapping '.' and '' items to NULL
                if field_is_list:
                    fields_columns.append(
                        f""" CAST(list_transform(string_split(NULLIF(regexp_extract("INFO", '{field_pattern}', 3), ''), ','), x -> CASE WHEN x = '.' OR x = '' THEN NULL ELSE x END) AS {field_sql_type}[]) AS '{prefix}{field_to_rename}' """
                    )

                # Single value: cast directly, mapping '.' and '' to NULL
                else:
                    fields_columns.append(
                        f""" NULLIF(regexp_replace(regexp_extract("INFO", '{field_pattern}', 3), '^\\.$', ''), '')::{field_sql_type} AS '{prefix}{field_to_rename}' """
                    )

        # Unknown field: emit NULL and warn
        else:
            fields_columns.append(f""" null AS '{prefix}{field_to_rename}' """)
            msg_err = f"Field '{field}' is not found (in table or header): '{field}' will be set to NULL"
            log.warning(msg=msg_err)

    # Optional row limit
    limit_clause = ""
    if limit is not None:
        limit_clause = f" LIMIT {limit} "

    # Query select
    query_select = f"""
        SELECT
            {', '.join([f'"{field}"' for field in fields_needed])}, {", ".join(fields_columns)}
        FROM
            {table}
        {limit_clause}
    """

    # Drop existing view/table if requested
    if drop_view:
        log.debug(f"Drop view: {view}")
        query_create_view = f"""
            DROP {view_type} IF EXISTS {view}
        """
        self.execute_query(query=query_create_view)
        log.debug(f"View dropped: {view}")

    # Create view
    log.debug(f"Create view: {view}")
    query_create_view = f"""
        CREATE {view_type} IF NOT EXISTS {view} AS {query_select}
    """
    self.execute_query(query=query_create_view)
    log.debug(f"View created: {view}")

    return view
The `create_annotations_view` function creates a SQL view from fields in a VCF INFO column.
Parameters
- table: The `table` parameter in the `create_annotations_view` function is used to specify the name of the table from which the fields are to be extracted. This table contains the variants data, and the function creates a view based on the fields in the INFO column of this table
- view: The `view` parameter in the `create_annotations_view` function is used to specify the name of the view that will be created based on the fields in the VCF INFO column. This view will contain the extracted fields from the INFO column in a structured format for further processing or analysis
- view_type: The `view_type` parameter in the `create_annotations_view` function is used to specify the type of view that will be created. It can be either a `VIEW` or a `TABLE`, and the function will create the view based on the specified type
- fields: The `fields` parameter in the `create_annotations_view` function is a list that contains the names of the fields to be extracted from the INFO column in the VCF file. These fields will be used to create the view with the specified columns and data extracted from the INFO column
- prefix: The `prefix` parameter in the `create_annotations_view` function is used to specify a prefix that will be added to the field names in the view. This prefix helps in distinguishing the fields extracted from the INFO column in the view
- drop_view: The `drop_view` parameter in the `create_annotations_view` function is a boolean flag that determines whether to drop the existing view with the same name before creating a new view. If set to `True`, the function will drop the existing view before creating a new view with the specified name
- fields_to_rename: The `fields_to_rename` parameter in the `create_annotations_view` function is a dictionary that contains the mapping of fields to be renamed in the VCF file. The keys in the dictionary represent the original field names that need to be renamed, and the corresponding values represent the new names to which the fields should be renamed
- limit: The `limit` parameter in the `create_annotations_view` function is an integer that specifies the maximum number of rows to be included in the view. If provided, the function will limit the number of rows in the view to the specified value
Returns
The `create_annotations_view` function returns the name of the view that is created based on the fields extracted from the INFO column in the VCF file. This view contains the extracted fields in a structured format for further processing or analysis